You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

base_loader.py 903 B

8 years ago
12345678910111213141516171819202122232425262728293031323334353637
  1. class BaseLoader(object):
  2. """docstring for BaseLoader"""
  3. def __init__(self, data_name, data_path):
  4. super(BaseLoader, self).__init__()
  5. self.data_name = data_name
  6. self.data_path = data_path
  7. def load(self):
  8. """
  9. :return: string
  10. """
  11. with open(self.data_path, "r", encoding="utf-8") as f:
  12. text = f.read()
  13. return text
  14. def load_lines(self):
  15. with open(self.data_path, "r", encoding="utf=8") as f:
  16. text = f.readlines()
  17. return text
  18. class ToyLoader0(BaseLoader):
  19. """
  20. For charLM
  21. """
  22. def __init__(self, name, path):
  23. super(ToyLoader0, self).__init__(name, path)
  24. def load(self):
  25. with open(self.data_path, 'r') as f:
  26. corpus = f.read().lower()
  27. import re
  28. corpus = re.sub(r"<unk>", "unk", corpus)
  29. return corpus.split()

一款轻量级的自然语言处理(NLP)工具包,目标是减少用户项目中的工程型代码,例如数据处理循环、训练循环、多卡运行等