
test_nlp.py

# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

# words.txt contains "home is behind the world head", one word per line.
DATA_FILE = "../data/dataset/testVocab/words.txt"
VOCAB_FILE = "../data/dataset/testVocab/vocab_list.txt"
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"


def test_on_tokenized_line():
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    # Register every word from the vocab file with jieba so each is kept as a single token.
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)
    data = data.map(input_columns=["text"], operations=jieba_op)
    # Build the vocab with <pad>/<unk> prepended and map tokens to ids;
    # out-of-vocabulary tokens fall back to <unk>.
    vocab = text.Vocab.from_file(VOCAB_FILE, ",", special_tokens=["<pad>", "<unk>"])
    lookup = text.Lookup(vocab, "<unk>")
    data = data.map(input_columns=["text"], operations=lookup)
    # Expected ids for the two lines in lines.txt.
    res = np.array([[10, 1, 11, 1, 12, 1, 15, 1, 13, 1, 14],
                    [11, 1, 12, 1, 10, 1, 14, 1, 13, 1, 15]], dtype=np.int32)
    for i, d in enumerate(data.create_dict_iterator()):
        np.testing.assert_array_equal(d["text"], res[i])


def test_on_tokenized_line_with_no_special_tokens():
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)
    data = data.map(input_columns=["text"], operations=jieba_op)
    # Same pipeline, but without special tokens: ids are not shifted and
    # out-of-vocabulary tokens fall back to the id of "not".
    vocab = text.Vocab.from_file(VOCAB_FILE, ",")
    lookup = text.Lookup(vocab, "not")
    data = data.map(input_columns=["text"], operations=lookup)
    res = np.array([[8, 0, 9, 0, 10, 0, 13, 0, 11, 0, 12],
                    [9, 0, 10, 0, 8, 0, 12, 0, 11, 0, 13]], dtype=np.int32)
    for i, d in enumerate(data.create_dict_iterator()):
        np.testing.assert_array_equal(d["text"], res[i])


if __name__ == '__main__':
    test_on_tokenized_line()
    test_on_tokenized_line_with_no_special_tokens()
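
The two tests differ only in whether <pad>/<unk> are prepended to the vocabulary: with special tokens the in-vocabulary ids in the expected arrays are shifted by two (10, 11, 12, ... versus 8, 9, 10, ...), and out-of-vocabulary tokens fall back to <unk> instead of the id of "not". The snippet below is a minimal sketch of that id offset using an in-memory word list; Vocab.from_list, tokens_to_ids, and the word list itself are illustrative assumptions (recent mindspore.dataset.text API) and are not part of the original test file.

import mindspore.dataset.text as text

# Hypothetical word list; the real tests read vocab_list.txt instead.
words = ["home", "is", "behind", "the", "world", "head"]

plain = text.Vocab.from_list(words)
padded = text.Vocab.from_list(words, special_tokens=["<pad>", "<unk>"], special_first=True)

# Word ids follow list order; prepending the two specials shifts every word id by 2.
print(plain.tokens_to_ids("home"), padded.tokens_to_ids("home"))  # expected: 0 2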