You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

test_dataset.py 8.8 kB

6 years ago
7 years ago
7 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
import os
import tempfile
import unittest

from fastNLP import DataSet
from fastNLP import FieldArray
from fastNLP import Instance
from fastNLP.io import CSVLoader
  7. class TestDataSetInit(unittest.TestCase):
  8. """初始化DataSet的办法有以下几种:
  9. 1) 用dict:
  10. 1.1) 二维list DataSet({"x": [[1, 2], [3, 4]]})
  11. 1.2) 二维array DataSet({"x": np.array([[1, 2], [3, 4]])})
  12. 1.3) 三维list DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
  13. 2) 用list of Instance:
  14. 2.1) 一维list DataSet([Instance(x=[1, 2, 3, 4])])
  15. 2.2) 一维array DataSet([Instance(x=np.array([1, 2, 3, 4]))])
  16. 2.3) 二维list DataSet([Instance(x=[[1, 2], [3, 4]])])
  17. 2.4) 二维array DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])
  18. 只接受纯list或者最外层ndarray
  19. """
  20. def test_init_v1(self):
  21. # 一维list
  22. ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
  23. self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
  24. self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
  25. self.assertEqual(ds.field_arrays["y"].content, [[5, 6], ] * 40)
  26. def test_init_v2(self):
  27. # 用dict
  28. ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
  29. self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
  30. self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
  31. self.assertEqual(ds.field_arrays["y"].content, [[5, 6], ] * 40)
  32. def test_init_assert(self):
  33. with self.assertRaises(AssertionError):
  34. _ = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 100})
  35. with self.assertRaises(AssertionError):
  36. _ = DataSet([[1, 2, 3, 4]] * 10)
  37. with self.assertRaises(ValueError):
  38. _ = DataSet(0.00001)
  39. class TestDataSetMethods(unittest.TestCase):
  40. def test_append(self):
  41. dd = DataSet()
  42. for _ in range(3):
  43. dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
  44. self.assertEqual(len(dd), 3)
  45. self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
  46. self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)
  47. def test_add_field(self):
  48. dd = DataSet()
  49. dd.add_field("x", [[1, 2, 3]] * 10)
  50. dd.add_field("y", [[1, 2, 3, 4]] * 10)
  51. dd.add_field("z", [[5, 6]] * 10)
  52. self.assertEqual(len(dd), 10)
  53. self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3]] * 10)
  54. self.assertEqual(dd.field_arrays["y"].content, [[1, 2, 3, 4]] * 10)
  55. self.assertEqual(dd.field_arrays["z"].content, [[5, 6]] * 10)
  56. with self.assertRaises(RuntimeError):
  57. dd.add_field("??", [[1, 2]] * 40)
  58. def test_add_field_ignore_type(self):
  59. dd = DataSet()
  60. dd.add_field("x", [(1, "1"), (2, "2"), (3, "3"), (4, "4")], ignore_type=True, is_target=True)
  61. dd.add_field("y", [{1, "1"}, {2, "2"}, {3, "3"}, {4, "4"}], ignore_type=True, is_target=True)
  62. def test_delete_field(self):
  63. dd = DataSet()
  64. dd.add_field("x", [[1, 2, 3]] * 10)
  65. dd.add_field("y", [[1, 2, 3, 4]] * 10)
  66. dd.delete_field("x")
  67. self.assertFalse("x" in dd.field_arrays)
  68. self.assertTrue("y" in dd.field_arrays)
  69. def test_getitem(self):
  70. ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
  71. ins_1, ins_0 = ds[0], ds[1]
  72. self.assertTrue(isinstance(ins_1, Instance) and isinstance(ins_0, Instance))
  73. self.assertEqual(ins_1["x"], [1, 2, 3, 4])
  74. self.assertEqual(ins_1["y"], [5, 6])
  75. self.assertEqual(ins_0["x"], [1, 2, 3, 4])
  76. self.assertEqual(ins_0["y"], [5, 6])
  77. sub_ds = ds[:10]
  78. self.assertTrue(isinstance(sub_ds, DataSet))
  79. self.assertEqual(len(sub_ds), 10)
  80. def test_get_item_error(self):
  81. with self.assertRaises(RuntimeError):
  82. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  83. _ = ds[40:]
  84. with self.assertRaises(KeyError):
  85. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  86. _ = ds["kom"]
  87. def test_len_(self):
  88. ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
  89. self.assertEqual(len(ds), 40)
  90. ds = DataSet()
  91. self.assertEqual(len(ds), 0)
  92. def test_apply(self):
  93. ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
  94. ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx")
  95. self.assertTrue("rx" in ds.field_arrays)
  96. self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])
  97. ds.apply(lambda ins: len(ins["y"]), new_field_name="y")
  98. self.assertEqual(ds.field_arrays["y"].content[0], 2)
  99. res = ds.apply(lambda ins: len(ins["x"]))
  100. self.assertTrue(isinstance(res, list) and len(res) > 0)
  101. self.assertTrue(res[0], 4)
  102. ds.apply(lambda ins: (len(ins["x"]), "hahaha"), new_field_name="k", ignore_type=True)
  103. # expect no exception raised
  104. def test_drop(self):
  105. ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20})
  106. ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True)
  107. self.assertEqual(len(ds), 20)
  108. def test_contains(self):
  109. ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
  110. self.assertTrue("x" in ds)
  111. self.assertTrue("y" in ds)
  112. self.assertFalse("z" in ds)
  113. def test_rename_field(self):
  114. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  115. ds.rename_field("x", "xx")
  116. self.assertTrue("xx" in ds)
  117. self.assertFalse("x" in ds)
  118. with self.assertRaises(KeyError):
  119. ds.rename_field("yyy", "oo")
  120. def test_input_target(self):
  121. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  122. ds.set_input("x")
  123. ds.set_target("y")
  124. self.assertTrue(ds.field_arrays["x"].is_input)
  125. self.assertTrue(ds.field_arrays["y"].is_target)
  126. with self.assertRaises(KeyError):
  127. ds.set_input("xxx")
  128. with self.assertRaises(KeyError):
  129. ds.set_input("yyy")
  130. def test_get_input_name(self):
  131. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  132. self.assertEqual(ds.get_input_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_input])
  133. def test_get_target_name(self):
  134. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  135. self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target])
  136. def test_split(self):
  137. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  138. d1, d2 = ds.split(0.1)
  139. def test_apply2(self):
  140. def split_sent(ins):
  141. return ins['raw_sentence'].split()
  142. csv_loader = CSVLoader(headers=['raw_sentence', 'label'],sep='\t')
  143. dataset = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv')
  144. dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0, inplace=True)
  145. dataset.apply(split_sent, new_field_name='words', is_input=True)
  146. # print(dataset)
  147. def test_add_field_v2(self):
  148. ds = DataSet({"x": [3, 4]})
  149. ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']], is_input=True, is_target=True)
  150. # ds.apply(lambda x:[x['x']]*3, is_input=True, is_target=True, new_field_name='y')
  151. print(ds)
  152. def test_save_load(self):
  153. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  154. ds.save("./my_ds.pkl")
  155. self.assertTrue(os.path.exists("./my_ds.pkl"))
  156. ds_1 = DataSet.load("./my_ds.pkl")
  157. os.remove("my_ds.pkl")
  158. def test_get_all_fields(self):
  159. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  160. ans = ds.get_all_fields()
  161. self.assertEqual(ans["x"].content, [[1, 2, 3, 4]] * 10)
  162. self.assertEqual(ans["y"].content, [[5, 6]] * 10)
  163. def test_get_field(self):
  164. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  165. ans = ds.get_field("x")
  166. self.assertTrue(isinstance(ans, FieldArray))
  167. self.assertEqual(ans.content, [[1, 2, 3, 4]] * 10)
  168. ans = ds.get_field("y")
  169. self.assertTrue(isinstance(ans, FieldArray))
  170. self.assertEqual(ans.content, [[5, 6]] * 10)
  171. def test_add_null(self):
  172. # TODO test failed because 'fastNLP\core\field.py:143: RuntimeError'
  173. ds = DataSet()
  174. with self.assertRaises(RuntimeError) as RE:
  175. ds.add_field('test', [])
  176. class TestDataSetIter(unittest.TestCase):
  177. def test__repr__(self):
  178. ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
  179. for iter in ds:
  180. self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4] type=list,\n'y': [5, 6] type=list}")