@@ -21,7 +21,7 @@ VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
 DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"
 
 
-def test_from_vocab_to_str():
+def test_from_vocab_to_str_UNIGRAM():
     vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
     tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
     dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
@@ -33,6 +33,43 @@ def test_from_vocab_to_str():
             assert value == expect[key]
 
 
+def test_from_vocab_to_str_BPE():
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {})
+    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
+    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
+    dataset = dataset.map(operations=tokenizer)
+    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.']
+    for i in dataset.create_dict_iterator():
+        ret = to_str(i["text"])
+        for key, value in enumerate(ret):
+            assert value == expect[key]
+
+
+def test_from_vocab_to_str_CHAR():
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
+    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
+    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
+    dataset = dataset.map(operations=tokenizer)
+    expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l', '▁', 'w', 'i', 't', 'h',
+              '▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
+    for i in dataset.create_dict_iterator():
+        ret = to_str(i["text"])
+        for key, value in enumerate(ret):
+            assert value == expect[key]
+
+
+def test_from_vocab_to_str_WORD():
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
+    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
+    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
+    dataset = dataset.map(operations=tokenizer)
+    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
+    for i in dataset.create_dict_iterator():
+        ret = to_str(i["text"])
+        for key, value in enumerate(ret):
+            assert value == expect[key]
+
+
 def test_from_vocab_to_int():
     vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
     tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
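For contrast with the STRING-output tests added above: the existing test_from_vocab_to_int, whose opening lines appear as context in this hunk, keeps the UNIGRAM model and changes only out_type. Below is a minimal standalone sketch of that distinction, not part of the patch itself; it assumes MindSpore is installed, the test data files exist, and the module-level imports match what the rest of this test file uses.

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType, to_str

VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"

# Train one UNIGRAM vocab, then run the same pipeline with both output types.
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995,
                                          SentencePieceModel.UNIGRAM, {})

for out_type in (SPieceTokenizerOutType.STRING, SPieceTokenizerOutType.INT):
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=out_type)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False).map(operations=tokenizer)
    for row in dataset.create_dict_iterator():
        if out_type == SPieceTokenizerOutType.STRING:
            # STRING rows hold subword pieces; decode the byte strings with to_str.
            print(to_str(row["text"]))
        else:
            # INT rows hold the vocabulary ids those pieces map to.
            print(row["text"])
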
@@ -85,7 +122,10 @@ def test_build_from_dataset():
 
 
 if __name__ == "__main__":
-    test_from_vocab_to_str()
+    test_from_vocab_to_str_UNIGRAM()
+    test_from_vocab_to_str_BPE()
+    test_from_vocab_to_str_CHAR()
+    test_from_vocab_to_str_WORD()
     test_from_vocab_to_int()
     test_from_file_to_str()
     test_from_file_to_int()
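Read together, the expect lists in this patch map out the granularity spectrum the model types cover: WORD keeps whole whitespace-delimited tokens (the period stays attached in '▁telescope.'), CHAR splits down to single characters, and the subword models sit in between (BPE splits 'telescope.' into '▁te', 'les', 'c', 'ope', '.'). A rough side-by-side sketch under the same assumptions as the sketch above:

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType, to_str

VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"

for model in (SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
              SentencePieceModel.CHAR, SentencePieceModel.WORD):
    # Same training settings as the tests: at most 5000 pieces,
    # 0.9995 character coverage, no extra params.
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, model, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False).map(operations=tokenizer)
    for row in dataset.create_dict_iterator():
        print(model, list(to_str(row["text"])))

Printing instead of asserting keeps the sketch independent of any particular trained vocabulary; the exact UNIGRAM and BPE splits depend on the botchan.txt training corpus.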