You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_glove.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import numpy as np
  16. import pytest
  17. from mindspore import log
  18. import mindspore.dataset as ds
  19. import mindspore.dataset.text as text
  20. import mindspore.dataset.text.transforms as T
# Directory containing the GloVe fixture files used by the tests below. The path is relative, so it is resolved
# against the current working directory of the test run.
DATASET_ROOT_PATH = "../data/dataset/testGloVe/"
  22. def test_glove_all_build_from_file_params():
  23. """
  24. Feature: GloVe
  25. Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  26. Expectation: output is equal to the expected value
  27. """
  28. vectors = text.GloVe.from_file(DATASET_ROOT_PATH + "glove.6B.test.txt", max_vectors=100)
  29. to_vectors = text.ToVectors(vectors)
  30. data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
  31. data = data.map(operations=to_vectors, input_columns=["text"])
  32. ind = 0
  33. res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
  34. [0, 0, 0, 0, 0, 0],
  35. [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
  36. [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
  37. [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
  38. [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
  39. [0, 0, 0, 0, 0, 0]]
  40. for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  41. res_array = np.array(res[ind], dtype=np.float32)
  42. assert np.array_equal(res_array, d["text"]), ind
  43. ind += 1
  44. def test_glove_all_build_from_file_params_eager():
  45. """
  46. Feature: GloVe
  47. Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile in eager mode
  48. Expectation: output is equal to the expected value
  49. """
  50. vectors = text.GloVe.from_file(DATASET_ROOT_PATH + "glove.6B.test.txt", max_vectors=4)
  51. to_vectors = T.ToVectors(vectors)
  52. result1 = to_vectors("ok")
  53. result2 = to_vectors("!")
  54. result3 = to_vectors("this")
  55. result4 = to_vectors("is")
  56. result5 = to_vectors("my")
  57. result6 = to_vectors("home")
  58. result7 = to_vectors("none")
  59. res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
  60. [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
  61. [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
  62. [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
  63. [0, 0, 0, 0, 0, 0],
  64. [0, 0, 0, 0, 0, 0],
  65. [0, 0, 0, 0, 0, 0]]
  66. res_array = np.array(res, dtype=np.float32)
  67. assert np.array_equal(result1, res_array[0])
  68. assert np.array_equal(result2, res_array[1])
  69. assert np.array_equal(result3, res_array[2])
  70. assert np.array_equal(result4, res_array[3])
  71. assert np.array_equal(result5, res_array[4])
  72. assert np.array_equal(result6, res_array[5])
  73. assert np.array_equal(result7, res_array[6])
  74. def test_glove_all_to_vectors_params_eager():
  75. """
  76. Feature: GloVe
  77. Description: test with all parameters which include `unk_init` and `lower_case_backup` in function ToVectors
  78. in eager mode
  79. Expectation: output is equal to the expected value
  80. """
  81. vectors = text.GloVe.from_file(DATASET_ROOT_PATH + "glove.6B.test.txt", max_vectors=4)
  82. my_unk = [-1, -1, -1, -1, -1, -1]
  83. to_vectors = T.ToVectors(vectors, unk_init=my_unk, lower_case_backup=True)
  84. result1 = to_vectors("Ok")
  85. result2 = to_vectors("!")
  86. result3 = to_vectors("This")
  87. result4 = to_vectors("is")
  88. result5 = to_vectors("my")
  89. result6 = to_vectors("home")
  90. result7 = to_vectors("none")
  91. res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
  92. [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
  93. [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
  94. [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
  95. [-1, -1, -1, -1, -1, -1],
  96. [-1, -1, -1, -1, -1, -1],
  97. [-1, -1, -1, -1, -1, -1]]
  98. res_array = np.array(res, dtype=np.float32)
  99. assert np.array_equal(result1, res_array[0])
  100. assert np.array_equal(result2, res_array[1])
  101. assert np.array_equal(result3, res_array[2])
  102. assert np.array_equal(result4, res_array[3])
  103. assert np.array_equal(result5, res_array[4])
  104. assert np.array_equal(result6, res_array[5])
  105. assert np.array_equal(result7, res_array[6])
  106. def test_glove_build_from_file():
  107. """
  108. Feature: GloVe
  109. Description: test with only default parameter
  110. Expectation: output is equal to the expected value
  111. """
  112. vectors = text.GloVe.from_file(DATASET_ROOT_PATH + "glove.6B.test.txt")
  113. to_vectors = text.ToVectors(vectors)
  114. data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
  115. data = data.map(operations=to_vectors, input_columns=["text"])
  116. ind = 0
  117. res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
  118. [0, 0, 0, 0, 0, 0],
  119. [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
  120. [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
  121. [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
  122. [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
  123. [0, 0, 0, 0, 0, 0]]
  124. for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  125. res_array = np.array(res[ind], dtype=np.float32)
  126. assert np.array_equal(res_array, d["text"]), ind
  127. ind += 1
  128. def test_glove_build_from_file_eager():
  129. """
  130. Feature: GloVe
  131. Description: test with only default parameter in eager mode
  132. Expectation: output is equal to the expected value
  133. """
  134. vectors = text.GloVe.from_file(DATASET_ROOT_PATH + "glove.6B.test.txt")
  135. to_vectors = T.ToVectors(vectors)
  136. result1 = to_vectors("ok")
  137. result2 = to_vectors("!")
  138. result3 = to_vectors("this")
  139. result4 = to_vectors("is")
  140. result5 = to_vectors("my")
  141. result6 = to_vectors("home")
  142. result7 = to_vectors("none")
  143. res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
  144. [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
  145. [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
  146. [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
  147. [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
  148. [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
  149. [0, 0, 0, 0, 0, 0]]
  150. res_array = np.array(res, dtype=np.float32)
  151. assert np.array_equal(result1, res_array[0])
  152. assert np.array_equal(result2, res_array[1])
  153. assert np.array_equal(result3, res_array[2])
  154. assert np.array_equal(result4, res_array[3])
  155. assert np.array_equal(result5, res_array[4])
  156. assert np.array_equal(result6, res_array[5])
  157. assert np.array_equal(result7, res_array[6])
  158. def test_glove_invalid_input():
  159. """
  160. Feature: GloVe
  161. Description: test the validate function with invalid parameters
  162. Expectation: output is equal to the expected error
  163. """
  164. def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None, unk_init=None,
  165. lower_case_backup=False, token="ok"):
  166. log.info("Test Vectors with wrong input: {0}".format(test_name))
  167. with pytest.raises(error) as error_info:
  168. vectors = text.GloVe.from_file(file_path, max_vectors=max_vectors)
  169. to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
  170. to_vectors(token)
  171. assert error_msg in str(error_info.value)
  172. test_invalid_input("Not all vectors have the same number of dimensions",
  173. DATASET_ROOT_PATH + "glove.6B.dim_different.txt", error=RuntimeError,
  174. error_msg="all vectors must have the same number of dimensions, " \
  175. "but got dim 5 while expecting 6")
  176. test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "glove.6B.empty.txt",
  177. error=RuntimeError, error_msg="invalid file, file is empty.")
  178. test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
  179. DATASET_ROOT_PATH + "glove.6B.test.txt",
  180. error=RuntimeError,
  181. error_msg="unk_init must be the same length as vectors, but got unk_init",
  182. unk_init=[-1, -1])
  183. test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.txt", RuntimeError,
  184. error_msg="GloVe: invalid file")
  185. test_invalid_input("The token is 1-dimensional", DATASET_ROOT_PATH + "glove.6B.with_wrong_info.txt",
  186. error=RuntimeError, error_msg="token with 1-dimensional vector.")
  187. test_invalid_input("max_vectors parameter must be greater than 0", DATASET_ROOT_PATH + "glove.6B.test.txt",
  188. error=ValueError, error_msg="Input max_vectors is not within the required interval",
  189. max_vectors=-1)
  190. test_invalid_input("invalid max_vectors parameter type as a float", DATASET_ROOT_PATH + "glove.6B.test.txt",
  191. error=TypeError, error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
  192. " but got <class 'float'>.", max_vectors=1.0)
  193. test_invalid_input("invalid max_vectors parameter type as a string", DATASET_ROOT_PATH + "glove.6B.test.txt",
  194. error=TypeError, error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
  195. " but got <class 'str'>.", max_vectors="1")
  196. test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "glove.6B.test.txt",
  197. error=RuntimeError, error_msg="input tensor type should be string.", token=1.0)
  198. test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "glove.6B.test.txt",
  199. error=TypeError, error_msg="Argument lower_case_backup with value True is " \
  200. "not of type [<class 'bool'>],"
  201. " but got <class 'str'>.", lower_case_backup="True")
  202. test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "glove.6B.test.txt",
  203. error=TypeError, error_msg="Argument lower_case_backup with value True is " \
  204. "not of type [<class 'bool'>],"
  205. " but got <class 'str'>.", lower_case_backup="True")
  206. test_invalid_input("not right glove dataset. The formal must be `glove.6B.*.txt`", DATASET_ROOT_PATH +
  207. "glove.6B.test.vec", error=RuntimeError, error_msg="GloVe: invalid file, can not " \
  208. "find file 'glove.6B.*.txt'")
  209. if __name__ == '__main__':
  210. test_glove_all_build_from_file_params()
  211. test_glove_all_build_from_file_params_eager()
  212. test_glove_all_to_vectors_params_eager()
  213. test_glove_build_from_file()
  214. test_glove_build_from_file_eager()
  215. test_glove_invalid_input()