You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

test_char_n_gram.py 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import numpy as np
  16. import pytest
  17. from mindspore import log
  18. import mindspore.dataset as ds
  19. import mindspore.dataset.text as text
  20. import mindspore.dataset.text.transforms as T
  21. DATASET_ROOT_PATH = "../data/dataset/testVectors/"
  22. def _count_unequal_element(data_expected, data_me, rtol, atol):
  23. assert data_expected.shape == data_me.shape
  24. total_count = len(data_expected.flatten())
  25. error = np.abs(data_expected - data_me)
  26. greater = np.greater(error, atol + np.abs(data_expected)*rtol)
  27. loss_count = np.count_nonzero(greater)
  28. assert (loss_count/total_count) < rtol,\
  29. "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".\
  30. format(data_expected[greater], data_me[greater], error[greater])
  31. def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True):
  32. if np.any(np.isnan(data_expected)):
  33. assert np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan)
  34. elif not np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan):
  35. _count_unequal_element(data_expected, data_me, rtol, atol)
  36. else:
  37. assert True
  38. def test_char_n_gram_all_to_vectors_params_eager():
  39. """
  40. Feature: CharNGram
  41. Description: test with all parameters which include `unk_init`
  42. and `lower_case_backup` in function ToVectors in eager mode
  43. Expectation: output is equal to the expected value
  44. """
  45. char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=18)
  46. unk_init = (-np.ones(5)).tolist()
  47. to_vectors = T.ToVectors(char_n_gram, unk_init=unk_init, lower_case_backup=True)
  48. result1 = to_vectors("THE")
  49. result2 = to_vectors(".")
  50. result3 = to_vectors("To")
  51. res = [[-1.34121733e+00, 4.42693333e-02, -4.86969667e-01, 6.62939000e-01, -3.67669000e-01],
  52. [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, -1.00000000e+00, -1.00000000e+00],
  53. [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]]
  54. res_array = np.array(res, dtype=np.float32)
  55. allclose_nparray(res_array[0], result1, 0.0001, 0.0001)
  56. allclose_nparray(res_array[1], result2, 0.0001, 0.0001)
  57. allclose_nparray(res_array[2], result3, 0.0001, 0.0001)
  58. def test_char_n_gram_build_from_file():
  59. """
  60. Feature: CharNGram
  61. Description: test with only default parameter
  62. Expectation: output is equal to the expected value
  63. """
  64. char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt")
  65. to_vectors = text.ToVectors(char_n_gram)
  66. data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
  67. data = data.map(operations=to_vectors, input_columns=["text"])
  68. ind = 0
  69. res = [[0., 0., 0., 0., 0.],
  70. [0., 0., 0., 0., 0.],
  71. [0.117336, 0.362446, -0.983326, 0.939264, -0.05648],
  72. [0.657201, 2.11761, -1.59276, 0.432072, 1.21395],
  73. [0., 0., 0., 0., 0.],
  74. [-2.26956, 0.288491, -0.740001, 0.661703, 0.147355],
  75. [0., 0., 0., 0., 0.]]
  76. for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  77. res_array = np.array(res[ind], dtype=np.float32)
  78. allclose_nparray(res_array, d["text"], 0.0001, 0.0001)
  79. ind += 1
  80. def test_char_n_gram_all_build_from_file_params():
  81. """
  82. Feature: CharNGram
  83. Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  84. Expectation: output is equal to the expected value
  85. """
  86. char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=100)
  87. to_vectors = text.ToVectors(char_n_gram)
  88. data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
  89. data = data.map(operations=to_vectors, input_columns=["text"])
  90. ind = 0
  91. res = [[0., 0., 0., 0., 0.],
  92. [0., 0., 0., 0., 0.],
  93. [0.117336, 0.362446, -0.983326, 0.939264, -0.05648],
  94. [0.657201, 2.11761, -1.59276, 0.432072, 1.21395],
  95. [0., 0., 0., 0., 0.],
  96. [-2.26956, 0.288491, -0.740001, 0.661703, 0.147355],
  97. [0., 0., 0., 0., 0.]]
  98. for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  99. res_array = np.array(res[ind], dtype=np.float32)
  100. allclose_nparray(res_array, d["text"], 0.0001, 0.0001)
  101. ind += 1
  102. def test_char_n_gram_all_build_from_file_params_eager():
  103. """
  104. Feature: CharNGram
  105. Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile in eager mode
  106. Expectation: output is equal to the expected value
  107. """
  108. char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=18)
  109. to_vectors = T.ToVectors(char_n_gram)
  110. result1 = to_vectors("the")
  111. result2 = to_vectors(".")
  112. result3 = to_vectors("to")
  113. res = [[-1.34121733e+00, 4.42693333e-02, -4.86969667e-01, 6.62939000e-01, -3.67669000e-01],
  114. [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
  115. [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]]
  116. res_array = np.array(res, dtype=np.float32)
  117. allclose_nparray(res_array[0], result1, 0.0001, 0.0001)
  118. allclose_nparray(res_array[1], result2, 0.0001, 0.0001)
  119. allclose_nparray(res_array[2], result3, 0.0001, 0.0001)
  120. def test_char_n_gram_build_from_file_eager():
  121. """
  122. Feature: CharNGram
  123. Description: test with only default parameter in eager mode
  124. Expectation: output is equal to the expected value
  125. """
  126. char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt")
  127. to_vectors = T.ToVectors(char_n_gram)
  128. result1 = to_vectors("the")
  129. result2 = to_vectors(".")
  130. result3 = to_vectors("to")
  131. res = [[-8.40079000e-01, -2.70002500e-02, -8.33472250e-01, 5.88367000e-01, -2.10011750e-01],
  132. [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
  133. [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]]
  134. res_array = np.array(res, dtype=np.float32)
  135. allclose_nparray(res_array[0], result1, 0.0001, 0.0001)
  136. allclose_nparray(res_array[1], result2, 0.0001, 0.0001)
  137. allclose_nparray(res_array[2], result3, 0.0001, 0.0001)
  138. def test_char_n_gram_invalid_input():
  139. """
  140. Feature: CharNGram
  141. Description: test the validate function with invalid parameters.
  142. Expectation: Verification of correct error message for invalid input.
  143. """
  144. def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None,
  145. unk_init=None, lower_case_backup=False, token="ok"):
  146. log.info("Test CharNGram with wrong input: {0}".format(test_name))
  147. with pytest.raises(error) as error_info:
  148. char_n_gram = text.CharNGram.from_file(file_path, max_vectors=max_vectors)
  149. to_vectors = T.ToVectors(char_n_gram, unk_init=unk_init, lower_case_backup=lower_case_backup)
  150. to_vectors(token)
  151. assert error_msg in str(error_info.value)
  152. test_invalid_input("Not all vectors have the same number of dimensions",
  153. DATASET_ROOT_PATH + "char_n_gram_20_dim_different.txt", error=RuntimeError,
  154. error_msg="all vectors must have the same number of dimensions, " +
  155. "but got dim 4 while expecting 5")
  156. test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "vectors_empty.txt",
  157. error=RuntimeError, error_msg="invalid file, file is empty.")
  158. test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
  159. DATASET_ROOT_PATH + "char_n_gram_20.txt",
  160. error=RuntimeError, error_msg="unk_init must be the same length as vectors, " +
  161. "but got unk_init: 6 and vectors: 5", unk_init=np.ones(6).tolist())
  162. test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.txt", RuntimeError,
  163. error_msg="get real path failed")
  164. test_invalid_input("max_vectors parameter must be greater than 0",
  165. DATASET_ROOT_PATH + "char_n_gram_20.txt", error=ValueError,
  166. error_msg="Input max_vectors is not within the required interval", max_vectors=-1)
  167. test_invalid_input("invalid max_vectors parameter type as a float",
  168. DATASET_ROOT_PATH + "char_n_gram_20.txt", error=TypeError,
  169. error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
  170. " but got <class 'float'>.", max_vectors=1.0)
  171. test_invalid_input("invalid max_vectors parameter type as a string",
  172. DATASET_ROOT_PATH + "char_n_gram_20.txt", error=TypeError,
  173. error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
  174. " but got <class 'str'>.", max_vectors="1")
  175. test_invalid_input("invalid token parameter type as a float",
  176. DATASET_ROOT_PATH + "char_n_gram_20.txt", error=RuntimeError,
  177. error_msg="input tensor type should be string.", token=1.0)
  178. test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "char_n_gram_20.txt",
  179. error=TypeError, error_msg="Argument lower_case_backup with " +
  180. "value True is not of type [<class 'bool'>],"
  181. " but got <class 'str'>.", lower_case_backup="True")
  182. test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "char_n_gram_20.txt",
  183. error=TypeError, error_msg="Argument lower_case_backup with " +
  184. "value True is not of type [<class 'bool'>],"
  185. " but got <class 'str'>.", lower_case_backup="True")
  186. if __name__ == '__main__':
  187. test_char_n_gram_all_to_vectors_params_eager()
  188. test_char_n_gram_build_from_file()
  189. test_char_n_gram_all_build_from_file_params()
  190. test_char_n_gram_all_build_from_file_params_eager()
  191. test_char_n_gram_build_from_file_eager()
  192. test_char_n_gram_invalid_input()