You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

cc_cleaner.py 6.9 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. import gzip
  2. import re
  3. import zhconv
  4. from trie_tree_match import Trie_tree
  5. class cleanPara(object):
  6. def __init__(self):
  7. self.rule_riwen = re.compile('[\u3040-\u31BF\u31F0-\u31FF]')
  8. #用','替代两个或以上的'_','~','-'
  9. self.rule_sub_1 = re.compile(r'(。;,、?\.)[_|~|-|——]{2,}')
  10. self.rule_sub_2 = re.compile(r'[_|~|-|——]{2,}(。;,、?\.)')
  11. self.rule_sub_3 = re.compile(r'[_|~|-|——]{2,}')
  12. #删除过多的 '.'
  13. self.rule_sub_4 = re.compile(r'[\.]{6,}')
  14. #把连续出现两次或以上的符号更替为1个 '。,?!@*'
  15. self.rule_sub_5 = re.compile(r'([。,?!@*]+?)\1+')
  16. #重复模式匹配,删除连续重复三次或以上的模式,保留一次模式
  17. self.rule_sub_6 = re.compile(r'(.+?)\1\1+')
  18. #去除html: <br><br><b><li><ol><blockquote>
  19. self.rule_sub_7 = re.compile(r'((<br>)|(<br>)|(<b>)|(<li>)|(<ol>)|(<blockquote>))')
  20. #删除多个换行符
  21. self.rule_sub_8 = re.compile(r'([\n\r]+?)\1+')
  22. # rule_remove = re.compile(r'([|\[\]【】])(.{1,15})\1(.{1,15})\1(.{1,15})\1(.{1,15})')
  23. #删除\\n的段落
  24. self.rule_remove_1 = re.compile(r'\\n')
  25. #删除太多空格的段落
  26. self.rule_remove_2 = re.compile(r'[^a-zA-Z0-9][ ][^a-zA-Z0-9]')
  27. #删除太多|的段落
  28. self.rule_remove_3 = re.compile(r'\|')
  29. def clean(self, paras):
  30. for para in paras:
  31. para = para.strip()
  32. if self.rule_riwen.search(para):
  33. pass
  34. else:
  35. para_len = len(para)
  36. if para_len>200:
  37. s = para
  38. #用','替代两个或以上的'_','~','-'
  39. s = self.rule_sub_1.sub('\1', s)
  40. s = self.rule_sub_2.sub('\1', s)
  41. s = self.rule_sub_3.sub(',', s)
  42. #删除过多的 '.'
  43. s = self.rule_sub_4.sub('', s)
  44. #把连续出现两次或以上的符号更替为1个 '。,?!@*'
  45. s = self.rule_sub_5.sub(r'\1', s)
  46. #重复模式匹配,删除重复三次或以上的模式,保留一次模式
  47. s = self.rule_sub_6.sub(r'\1', s)
  48. #去除html: <br><br><b><li><ol><blockquote>
  49. s = self.rule_sub_7.sub('', s)
  50. if self.rule_remove_1.search(s) or \
  51. len(self.rule_remove_2.findall(s))/para_len>0.05 or \
  52. len(self.rule_remove_3.findall(s))/para_len>0.05:
  53. pass
  54. else:
  55. yield zhconv.convert(s+'\n', 'zh-cn')
  56. class cleanLine(object):
  57. def __init__(self):
  58. self.rule_symbol = re.compile('[,。;]')
  59. pattern = '[\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b1234567890]'
  60. self.rule_chinese = re.compile(pattern)
  61. self.rule_feff = re.compile('<feff>')
  62. self.line_0 = ''
  63. self.line_1 = ''
  64. self.line_2 = ''
  65. def clean(self, lines):
  66. for line in lines:
  67. self.line_0 = self.line_1
  68. self.line_1 = self.line_2
  69. self.line_2 = line
  70. #删除包含<feff>的行
  71. if self.rule_feff.findall(self.line_1):
  72. yield ''
  73. self.line_1 = self.line_1.strip()
  74. chinese = self.rule_chinese.findall(self.line_0)
  75. len_line = len(self.line_1) + 0.0001
  76. len_chinese = len(chinese)
  77. rate_0 = len_chinese/len_line
  78. chinese = self.rule_chinese.findall(self.line_1)
  79. len_line = len(self.line_1) + 0.0001
  80. len_chinese = len(chinese)
  81. rate_1 = len_chinese/len_line
  82. chinese = self.rule_chinese.findall(self.line_2)
  83. len_line = len(self.line_1) + 0.0001
  84. len_chinese = len(chinese)
  85. rate_2 = len_chinese/len_line
  86. if rate_1<0.7 and (rate_0<0.7 or rate_2<0.7):
  87. yield '\n'
  88. if self.rule_symbol.search(self.line_1):
  89. yield self.line_1+'\n'
  90. elif self.rule_symbol.search(self.line_0) and self.rule_symbol.search(self.line_2):
  91. yield self.line_1+'\n'
  92. else:
  93. yield '\n'
  94. class loadPara(object):
  95. def __init__(self):
  96. self.preLine = ''
  97. def load(self, lines):
  98. res = ''
  99. for line in lines:
  100. if line == '\n' and self.preLine != '\n':
  101. yield res
  102. if line == '\n':
  103. res = ''
  104. else:
  105. res += line
  106. def readLineContainingChinese(filename):
  107. grop = re.compile('[\u4e00-\u9fa5]')
  108. with gzip.open(filename, 'rt') as f:
  109. fore_line = ''
  110. while True:
  111. line = f.readline()
  112. if line =='\n' and fore_line != '\n':
  113. yield line
  114. fore_line=line
  115. res=''
  116. all_str=res.join(grop.findall(line))
  117. if line:
  118. rate=len(all_str)/len(line)
  119. if rate>0.1:
  120. yield line
  121. if not line:
  122. break
  123. def loadSensitiveWord(path):
  124. with open(path, encoding="utf-8") as f:
  125. vocabs = f.read().splitlines()
  126. return vocabs
  127. def filter_by_trie(text, trie_model, threhold=3):
  128. """
  129. 0:dirty
  130. 1:clean
  131. """
  132. res = trie_model.find_one(text)
  133. # print(res)
  134. if len(res) > threhold:
  135. label=0
  136. return label,res
  137. elif len(res) >= 1 and len(text) < 200:
  138. label=0
  139. return label,res
  140. else:
  141. label=1
  142. return label,res
  143. def cleanSensitiveWord(trie_model, paras):
  144. for para in paras:
  145. para = para.strip()
  146. if para:
  147. label_trie,match_result = filter_by_trie(para, trie_model, 3)
  148. if label_trie:
  149. yield para + "\n"
  150. def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"):
  151. lines = readLineContainingChinese(fileName)
  152. linesCleaned = cleanLine().clean(lines)
  153. paras = loadPara().load(linesCleaned)
  154. parasCleaned = cleanPara().clean(paras)
  155. dirty_vocab = loadSensitiveWord(sensitiveFile)
  156. trie_model = Trie_tree()
  157. trie_model.load_vocab(dirty_vocab)
  158. parasCleaned = cleanSensitiveWord(trie_model, parasCleaned)
  159. with open(saveDir,'w',encoding='utf-8') as fw:
  160. for para in parasCleaned:
  161. fw.write(para)
  162. fw.write('\n')
  163. if __name__ == '__main__':
  164. fileName = '/gdata/commonCrawl/common-crawl-WET-20201124-v1-ori/CC-MAIN-2019-04/CC-MAIN-20190116093643-20190116115643-00025.warc.wet.gz'
  165. saveDir = '/ghome/yands/dataset/tmp2.txt'
  166. sensitiveFile="sensitive_words-v2.txt"
  167. clean(fileName, saveDir, sensitiveFile)