# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
- """
- Testing WordpieceTokenizer op in DE
- """
import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as nlp
from mindspore import log as logger

WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"
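# The data file is assumed to hold one word per line: as implied by the
# first/last windows and expected outputs below, lines 1-10 carry an English
# sentence plus an out-of-vocabulary word, and lines 11-25 the corresponding
# Chinese characters.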

vocab_english = [
    "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"
]

vocab_chinese = [
    "我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"
]

vocab_mix = vocab_chinese + vocab_english

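# Each parameter dict drives one pass over the data file:
#   first/last          -- 1-based line window of the data file to tokenize
#   expect_str          -- expected wordpiece output, one list per input line
#   vocab_list          -- vocabulary used to build the tokenizer
#   unknown_token       -- replacement for out-of-vocabulary words; an empty
#                          string keeps the original word unchanged
#   max_bytes_per_token -- words longer than this map to unknown_token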
test_paras = [
    dict(
        first=1,
        last=10,
        expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
                    ['era'], ['[UNK]']],
        vocab_list=vocab_english
    ),
    dict(
        first=1,
        last=10,
        expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
                    ['era'], ['what']],
        vocab_list=vocab_english,
        unknown_token=""
    ),
    dict(
        first=1,
        last=10,
        expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
        vocab_list=vocab_english,
        max_bytes_per_token=4
    ),
    dict(
        first=11,
        last=25,
        expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
                    ['[UNK]']],
        vocab_list=vocab_chinese
    ),
    dict(
        first=25,
        last=25,
        expect_str=[['您']],
        vocab_list=vocab_chinese,
        unknown_token=""
    ),
    dict(
        first=1,
        last=25,
        expect_str=[
            ['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'],
            ['[UNK]'],
            ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
            ['[UNK]']],
        vocab_list=vocab_mix
    ),
]


def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
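    # Narrow the dataset to the 1-based line window [first, last] via skip/take.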
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
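    # Build an in-memory vocabulary and the tokenizer op under test.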
    vocab = nlp.Vocab.from_list(vocab_list)
    tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
                                          max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text'])
        logger.info("Out: %s", text)
        logger.info("Exp: %s", expect_str[count])
        np.testing.assert_array_equal(text, expect_str[count])
        count += 1
    # Every expected row must have been produced; guard against an empty window.
    assert count == len(expect_str)


def test_wordpiece_tokenizer():
    """
    Test WordpieceTokenizer with English, Chinese, and mixed vocabularies.
    """
    for paras in test_paras:
        check_wordpiece_tokenizer(**paras)


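# For reference, a minimal pure-Python sketch of the greedy longest-match rule
# that the expected outputs above encode. This is only an illustrative
# re-implementation, not the kernel behind nlp.WordpieceTokenizer: from the
# left, take the longest vocabulary prefix, mark continuation pieces with
# '##', and fall back to unknown_token when nothing matches, e.g.
# "favorite" -> ['favor', '##ite']. (The real op additionally maps words
# longer than max_bytes_per_token to unknown_token, per the third test case.)
def reference_wordpiece(word, vocab, unknown_token='[UNK]'):
    pieces, start = [], 0
    while start < len(word):
        end, cur = len(word), None
        while start < end:
            piece = word[start:end] if start == 0 else '##' + word[start:end]
            if piece in vocab:
                cur = piece
                break
            end -= 1
        if cur is None:
            # No prefix matched: the whole word maps to unknown_token,
            # or stays unchanged when unknown_token is ''.
            return [unknown_token] if unknown_token else [word]
        pieces.append(cur)
        start = end
    return pieces

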
if __name__ == '__main__':
    test_wordpiece_tokenizer()