import gzip
import re
import zhconv
from trie_tree_match import Trie_tree
class cleanPara(object):
def __init__(self):
self.rule_riwen = re.compile('[\u3040-\u31BF\u31F0-\u31FF]')
#用','替代两个或以上的'_','~','-'
self.rule_sub_1 = re.compile(r'(。;,、?\.)[_|~|-|——]{2,}')
self.rule_sub_2 = re.compile(r'[_|~|-|——]{2,}(。;,、?\.)')
self.rule_sub_3 = re.compile(r'[_|~|-|——]{2,}')
#删除过多的 '.'
self.rule_sub_4 = re.compile(r'[\.]{6,}')
#把连续出现两次或以上的符号更替为1个 '。,?!@*'
self.rule_sub_5 = re.compile(r'([。,?!@*]+?)\1+')
#重复模式匹配,删除连续重复三次或以上的模式,保留一次模式
self.rule_sub_6 = re.compile(r'(.+?)\1\1+')
#去除html:
self.rule_sub_7 = re.compile(r'((
)|(
)|()|(- )|(
)|())')
#删除多个换行符
self.rule_sub_8 = re.compile(r'([\n\r]+?)\1+')
# rule_remove = re.compile(r'([|\[\]【】])(.{1,15})\1(.{1,15})\1(.{1,15})\1(.{1,15})')
#删除\\n的段落
self.rule_remove_1 = re.compile(r'\\n')
#删除太多空格的段落
self.rule_remove_2 = re.compile(r'[^a-zA-Z0-9][ ][^a-zA-Z0-9]')
#删除太多|的段落
self.rule_remove_3 = re.compile(r'\|')
def clean(self, paras):
for para in paras:
para = para.strip()
if self.rule_riwen.search(para):
pass
else:
para_len = len(para)
if para_len>200:
s = para
#用','替代两个或以上的'_','~','-'
s = self.rule_sub_1.sub('\1', s)
s = self.rule_sub_2.sub('\1', s)
s = self.rule_sub_3.sub(',', s)
#删除过多的 '.'
s = self.rule_sub_4.sub('', s)
#把连续出现两次或以上的符号更替为1个 '。,?!@*'
s = self.rule_sub_5.sub(r'\1', s)
#重复模式匹配,删除重复三次或以上的模式,保留一次模式
s = self.rule_sub_6.sub(r'\1', s)
#去除html:
s = self.rule_sub_7.sub('', s)
if self.rule_remove_1.search(s) or \
len(self.rule_remove_2.findall(s))/para_len>0.05 or \
len(self.rule_remove_3.findall(s))/para_len>0.05:
pass
else:
yield zhconv.convert(s+'\n', 'zh-cn')
class cleanLine(object):
    """Line-level cleaner using a sliding three-line window.

    For each emitted line (line_1), its neighbours (line_0 before, line_2
    after) help decide whether it is real prose or boilerplate: lines with
    a low density of Chinese characters, flanked by another low-density
    line, become paragraph breaks; lines without sentence punctuation are
    kept only when both neighbours have some.
    """

    def __init__(self):
        # Sentence-internal fullwidth punctuation marking a "real" prose line.
        self.rule_symbol = re.compile('[,。;]')
        # Chinese ideographs, common fullwidth punctuation, and digits.
        pattern = '[\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b1234567890]'
        self.rule_chinese = re.compile(pattern)
        # NOTE(review): this pattern appears empty here — it presumably held
        # an invisible character (e.g. a BOM '\ufeff') in the original file;
        # an empty pattern matches every line. Kept as found.
        self.rule_feff = re.compile('')
        # Sliding window state; line_1 is the line processed each round.
        self.line_0 = ''
        self.line_1 = ''
        self.line_2 = ''

    def clean(self, lines):
        """Yield cleaned lines (or '\\n' paragraph breaks) for *lines*.

        Output lags the input by one line because decisions about line_1
        need its successor line_2.
        """
        for line in lines:
            # Advance the window.
            self.line_0 = self.line_1
            self.line_1 = self.line_2
            self.line_2 = line
            # Lines matching rule_feff are blanked out.
            if self.rule_feff.findall(self.line_1):
                yield ''
            self.line_1 = self.line_1.strip()
            # Chinese-character density of each window line.
            # BUG FIX: the original divided rate_0 and rate_2 by
            # len(self.line_1) (copy-paste error); each rate now uses its
            # own line's length. The +0.0001 avoids division by zero.
            chinese = self.rule_chinese.findall(self.line_0)
            len_line = len(self.line_0) + 0.0001
            rate_0 = len(chinese) / len_line
            chinese = self.rule_chinese.findall(self.line_1)
            len_line = len(self.line_1) + 0.0001
            rate_1 = len(chinese) / len_line
            chinese = self.rule_chinese.findall(self.line_2)
            len_line = len(self.line_2) + 0.0001
            rate_2 = len(chinese) / len_line
            # Low-density line next to another low-density line: emit a
            # paragraph break.
            if rate_1 < 0.7 and (rate_0 < 0.7 or rate_2 < 0.7):
                yield '\n'
            # Keep the line if it carries sentence punctuation itself, or if
            # both neighbours do; otherwise emit a paragraph break.
            if self.rule_symbol.search(self.line_1):
                yield self.line_1 + '\n'
            elif self.rule_symbol.search(self.line_0) and self.rule_symbol.search(self.line_2):
                yield self.line_1 + '\n'
            else:
                yield '\n'
class loadPara(object):
    """Accumulate cleaned lines into paragraphs, splitting on blank lines."""

    def __init__(self):
        # Previously seen line, used to collapse runs of blank lines so that
        # consecutive '\n' lines emit at most one paragraph.
        self.preLine = ''

    def load(self, lines):
        """Yield one accumulated paragraph per blank-line boundary.

        Non-blank lines are concatenated; a '\\n' line flushes the current
        paragraph (unless the previous line was also blank).
        """
        res = ''
        for line in lines:
            if line == '\n' and self.preLine != '\n':
                yield res
            if line == '\n':
                res = ''
            else:
                res += line
            # BUG FIX: the original never updated preLine, so every blank
            # line — even consecutive ones — emitted an (empty) paragraph.
            self.preLine = line
        # BUG FIX: flush a trailing paragraph not followed by a blank line,
        # which the original silently dropped.
        if res:
            yield res
def readLineContainingChinese(filename):
    """Yield interesting lines from a gzipped UTF-8 text file.

    A line is yielded when it is a blank line that does not directly follow
    another blank line (a paragraph boundary), or when more than 10% of its
    characters are CJK ideographs.
    """
    han_pattern = re.compile('[\u4e00-\u9fa5]')
    previous = ''
    with gzip.open(filename, 'rt') as fh:
        for line in fh:
            # Keep a single blank line between paragraphs.
            if line == '\n' and previous != '\n':
                yield line
            previous = line
            # Density of Chinese characters in this line.
            han_chars = ''.join(han_pattern.findall(line))
            if len(han_chars) / len(line) > 0.1:
                yield line
def loadSensitiveWord(path):
    """Read the sensitive-word list at *path* (UTF-8, one word per line).

    Returns the words as a list of strings without line terminators.
    """
    with open(path, encoding="utf-8") as word_file:
        content = word_file.read()
    return content.splitlines()
def filter_by_trie(text, trie_model, threhold=3):
    """Classify *text* against the sensitive-word trie.

    Returns (label, matches) where label is 0 (dirty) or 1 (clean).
    Text is dirty when it has more than `threhold` matches, or when it has
    any match at all while being shorter than 200 characters.
    """
    matches = trie_model.find_one(text)
    is_dirty = len(matches) > threhold or (len(matches) >= 1 and len(text) < 200)
    label = 0 if is_dirty else 1
    return label, matches
def cleanSensitiveWord(trie_model, paras):
    """Yield the paragraphs in *paras* that pass the sensitive-word screen.

    Blank paragraphs are dropped; surviving paragraphs are re-terminated
    with a newline.
    """
    for paragraph in paras:
        stripped = paragraph.strip()
        if not stripped:
            continue
        label, _matches = filter_by_trie(stripped, trie_model, 3)
        if label:
            yield stripped + "\n"
def clean(fileName, saveDir, sensitiveFile="sensitive_words-v2.txt"):
    """Run the full cleaning pipeline on one gzipped crawl file.

    Reads *fileName*, chains the line/paragraph cleaners and the
    sensitive-word filter, and writes the surviving paragraphs to
    *saveDir* (one paragraph per block, separated by blank lines).
    """
    # Build the lazy generator pipeline stage by stage.
    pipeline = readLineContainingChinese(fileName)
    pipeline = cleanLine().clean(pipeline)
    pipeline = loadPara().load(pipeline)
    pipeline = cleanPara().clean(pipeline)
    # Load the sensitive vocabulary into the trie matcher.
    trie = Trie_tree()
    trie.load_vocab(loadSensitiveWord(sensitiveFile))
    pipeline = cleanSensitiveWord(trie, pipeline)
    with open(saveDir, 'w', encoding='utf-8') as out_file:
        for paragraph in pipeline:
            out_file.write(paragraph)
            out_file.write('\n')
if __name__ == '__main__':
    # Example invocation on a single Common Crawl WET shard.
    source_path = '/gdata/commonCrawl/common-crawl-WET-20201124-v1-ori/CC-MAIN-2019-04/CC-MAIN-20190116093643-20190116115643-00025.warc.wet.gz'
    output_path = '/ghome/yands/dataset/tmp2.txt'
    vocab_path = "sensitive_words-v2.txt"
    clean(source_path, output_path, vocab_path)