You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_cws.py 1.7 kB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import unittest
  2. import os
  3. from fastNLP.io.pipe.cws import CWSPipe
  4. class TestCWSPipe(unittest.TestCase):
  5. @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
  6. def test_process_from_file(self):
  7. dataset_names = ['pku', 'cityu', 'as', 'msra']
  8. for dataset_name in dataset_names:
  9. with self.subTest(dataset_name=dataset_name):
  10. data_bundle = CWSPipe(dataset_name=dataset_name).process_from_file()
  11. print(data_bundle)
  12. def test_demo(self):
  13. # related to issue https://github.com/fastnlp/fastNLP/issues/324#issue-705081091
  14. from fastNLP import DataSet, Instance
  15. from fastNLP.io import DataBundle
  16. data_bundle = DataBundle()
  17. ds = DataSet()
  18. ds.append(Instance(raw_words="截流 进入 最后 冲刺 ( 附 图片 1 张 )"))
  19. data_bundle.set_dataset(ds, name='train')
  20. data_bundle = CWSPipe().process(data_bundle)
  21. self.assertFalse('<' in data_bundle.get_vocab('chars'))
  22. class TestRunCWSPipe(unittest.TestCase):
  23. def test_process_from_file(self):
  24. dataset_names = ['msra', 'cityu', 'as', 'pku']
  25. for dataset_name in dataset_names:
  26. with self.subTest(dataset_name=dataset_name):
  27. data_bundle = CWSPipe(bigrams=True, trigrams=True).\
  28. process_from_file(f'tests/data_for_tests/io/cws_{dataset_name}')
  29. print(data_bundle)
  30. def test_replace_number(self):
  31. data_bundle = CWSPipe(bigrams=True, replace_num_alpha=True).\
  32. process_from_file(f'tests/data_for_tests/io/cws_pku')
  33. for word in ['<', '>', '<NUM>']:
  34. self.assertNotEqual(data_bundle.get_vocab('chars').to_index(word), 1)