|
- import gzip
- import re
- import zhconv
- from trie_tree_match import Trie_tree
-
-
- class cleanPara(object):
- def __init__(self):
- self.rule_riwen = re.compile('[\u3040-\u31BF\u31F0-\u31FF]')
- #用','替代两个或以上的'_','~','-'
- self.rule_sub_1 = re.compile(r'(。;,、?\.)[_|~|-|——]{2,}')
- self.rule_sub_2 = re.compile(r'[_|~|-|——]{2,}(。;,、?\.)')
- self.rule_sub_3 = re.compile(r'[_|~|-|——]{2,}')
- #删除过多的 '.'
- self.rule_sub_4 = re.compile(r'[\.]{6,}')
- #把连续出现两次或以上的符号更替为1个 '。,?!@*'
- self.rule_sub_5 = re.compile(r'([。,?!@*]+?)\1+')
- #重复模式匹配,删除连续重复三次或以上的模式,保留一次模式
- self.rule_sub_6 = re.compile(r'(.+?)\1\1+')
- #去除html: <br><br><b><li><ol><blockquote>
- self.rule_sub_7 = re.compile(r'((<br>)|(<br>)|(<b>)|(<li>)|(<ol>)|(<blockquote>))')
- #删除多个换行符
- self.rule_sub_8 = re.compile(r'([\n\r]+?)\1+')
- # rule_remove = re.compile(r'([|\[\]【】])(.{1,15})\1(.{1,15})\1(.{1,15})\1(.{1,15})')
- #删除\\n的段落
- self.rule_remove_1 = re.compile(r'\\n')
- #删除太多空格的段落
- self.rule_remove_2 = re.compile(r'[^a-zA-Z0-9][ ][^a-zA-Z0-9]')
- #删除太多|的段落
- self.rule_remove_3 = re.compile(r'\|')
-
- def clean(self, paras):
- for para in paras:
- para = para.strip()
-
- if self.rule_riwen.search(para):
- pass
- else:
- para_len = len(para)
- if para_len>200:
- s = para
- #用','替代两个或以上的'_','~','-'
- s = self.rule_sub_1.sub('\1', s)
- s = self.rule_sub_2.sub('\1', s)
- s = self.rule_sub_3.sub(',', s)
-
- #删除过多的 '.'
- s = self.rule_sub_4.sub('', s)
-
- #把连续出现两次或以上的符号更替为1个 '。,?!@*'
- s = self.rule_sub_5.sub(r'\1', s)
-
- #重复模式匹配,删除重复三次或以上的模式,保留一次模式
- s = self.rule_sub_6.sub(r'\1', s)
-
- #去除html: <br><br><b><li><ol><blockquote>
- s = self.rule_sub_7.sub('', s)
-
- if self.rule_remove_1.search(s) or \
- len(self.rule_remove_2.findall(s))/para_len>0.05 or \
- len(self.rule_remove_3.findall(s))/para_len>0.05:
- pass
- else:
- yield zhconv.convert(s+'\n', 'zh-cn')
-
-
- class cleanLine(object):
- def __init__(self):
- self.rule_symbol = re.compile('[,。;]')
- pattern = '[\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b1234567890]'
- self.rule_chinese = re.compile(pattern)
- self.rule_feff = re.compile('<feff>')
- self.line_0 = ''
- self.line_1 = ''
- self.line_2 = ''
-
- def clean(self, lines):
-
- for line in lines:
- self.line_0 = self.line_1
- self.line_1 = self.line_2
- self.line_2 = line
- #删除包含<feff>的行
- if self.rule_feff.findall(self.line_1):
- yield ''
- self.line_1 = self.line_1.strip()
-
- chinese = self.rule_chinese.findall(self.line_0)
- len_line = len(self.line_1) + 0.0001
- len_chinese = len(chinese)
- rate_0 = len_chinese/len_line
-
- chinese = self.rule_chinese.findall(self.line_1)
- len_line = len(self.line_1) + 0.0001
- len_chinese = len(chinese)
- rate_1 = len_chinese/len_line
-
- chinese = self.rule_chinese.findall(self.line_2)
- len_line = len(self.line_1) + 0.0001
- len_chinese = len(chinese)
- rate_2 = len_chinese/len_line
-
- if rate_1<0.7 and (rate_0<0.7 or rate_2<0.7):
- yield '\n'
-
- if self.rule_symbol.search(self.line_1):
- yield self.line_1+'\n'
- elif self.rule_symbol.search(self.line_0) and self.rule_symbol.search(self.line_2):
- yield self.line_1+'\n'
- else:
- yield '\n'
-
-
- class loadPara(object):
- def __init__(self):
- self.preLine = ''
-
- def load(self, lines):
- res = ''
- for line in lines:
- if line == '\n' and self.preLine != '\n':
- yield res
- if line == '\n':
- res = ''
- else:
- res += line
-
-
- def readLineContainingChinese(filename):
- grop = re.compile('[\u4e00-\u9fa5]')
- with gzip.open(filename, 'rt') as f:
- fore_line = ''
- while True:
- line = f.readline()
-
- if line =='\n' and fore_line != '\n':
- yield line
- fore_line=line
-
- res=''
- all_str=res.join(grop.findall(line))
- if line:
- rate=len(all_str)/len(line)
- if rate>0.1:
- yield line
- if not line:
- break
-
-
- def loadSensitiveWord(path):
- with open(path, encoding="utf-8") as f:
- vocabs = f.read().splitlines()
- return vocabs
-
-
- def filter_by_trie(text, trie_model, threhold=3):
- """
- 0:dirty
- 1:clean
- """
- res = trie_model.find_one(text)
- # print(res)
- if len(res) > threhold:
- label=0
- return label,res
- elif len(res) >= 1 and len(text) < 200:
- label=0
- return label,res
- else:
- label=1
- return label,res
-
-
- def cleanSensitiveWord(trie_model, paras):
- for para in paras:
- para = para.strip()
- if para:
- label_trie,match_result = filter_by_trie(para, trie_model, 3)
- if label_trie:
- yield para + "\n"
-
-
- def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"):
- lines = readLineContainingChinese(fileName)
- linesCleaned = cleanLine().clean(lines)
- paras = loadPara().load(linesCleaned)
- parasCleaned = cleanPara().clean(paras)
-
- dirty_vocab = loadSensitiveWord(sensitiveFile)
- trie_model = Trie_tree()
- trie_model.load_vocab(dirty_vocab)
- parasCleaned = cleanSensitiveWord(trie_model, parasCleaned)
-
- with open(saveDir,'w',encoding='utf-8') as fw:
- for para in parasCleaned:
- fw.write(para)
- fw.write('\n')
-
-
-
- if __name__ == '__main__':
-
- fileName = '/gdata/commonCrawl/common-crawl-WET-20201124-v1-ori/CC-MAIN-2019-04/CC-MAIN-20190116093643-20190116115643-00025.warc.wet.gz'
- saveDir = '/ghome/yands/dataset/tmp2.txt'
- sensitiveFile="sensitive_words-v2.txt"
-
- clean(fileName, saveDir, sensitiveFile)
-
|