import gzip
import re

import zhconv

from trie_tree_match import Trie_tree


class cleanPara(object):
    def __init__(self):
        # Kana and related phonetic ranges, used to detect (and skip) Japanese text.
        self.rule_riwen = re.compile('[\u3040-\u31BF\u31F0-\u31FF]')
        # Replace runs of two or more '_', '~', '-', '——' with ','; when a run
        # borders punctuation, keep only the punctuation. (The punctuation group
        # is written as a character class here: the extracted source had it as a
        # literal sequence, which could never match. The hyphen is moved to the
        # end of the run class so it matches literally.)
        self.rule_sub_1 = re.compile(r'([。;,、?\.])[_~|—-]{2,}')
        self.rule_sub_2 = re.compile(r'[_~|—-]{2,}([。;,、?\.])')
        self.rule_sub_3 = re.compile(r'[_~|—-]{2,}')
        # Delete overly long runs of '.'.
        self.rule_sub_4 = re.compile(r'[\.]{6,}')
        # Collapse two or more consecutive occurrences of '。,?!@*' into one.
        self.rule_sub_5 = re.compile(r'([。,?!@*]+?)\1+')
        # Repeated-pattern match: collapse a pattern repeated three or more
        # consecutive times down to a single occurrence.
        self.rule_sub_6 = re.compile(r'(.+?)\1\1+')
        # Strip HTML remnants. NOTE: the original alternation of literal tags was
        # lost when this file was extracted; the pattern below is a reconstruction
        # covering common tags and entities, not the exact original list.
        self.rule_sub_7 = re.compile(r'(<br\s*/?>)|(</?p>)|(&nbsp;)|(&amp;)|(&lt;)|(&gt;)')
        # Collapse runs of newline characters.
        self.rule_sub_8 = re.compile(r'([\n\r]+?)\1+')
        # rule_remove = re.compile(r'([|\[\]【】])(.{1,15})\1(.{1,15})\1(.{1,15})\1(.{1,15})')
        # Drop paragraphs that contain a literal '\n' escape.
        self.rule_remove_1 = re.compile(r'\\n')
        # Drop paragraphs with too many spaces (spaces not flanked by alphanumerics).
        self.rule_remove_2 = re.compile(r'[^a-zA-Z0-9][ ][^a-zA-Z0-9]')
        # Drop paragraphs with too many '|'.
        self.rule_remove_3 = re.compile(r'\|')

    def clean(self, paras):
        for para in paras:
            para = para.strip()
            # Skip paragraphs containing Japanese kana.
            if self.rule_riwen.search(para):
                continue
            para_len = len(para)
            if para_len > 200:
                s = para
                # Keep the punctuation, drop the adjacent '_'/'~'/'-' run.
                # (The extracted source passed '\1' as a plain string, which
                # inserts the control character \x01; raw-string backreferences
                # are what these substitutions intend.)
                s = self.rule_sub_1.sub(r'\1', s)
                s = self.rule_sub_2.sub(r'\1', s)
                # Replace remaining runs of '_', '~', '-' with ','.
                s = self.rule_sub_3.sub(',', s)
                # Delete overly long runs of '.'.
                s = self.rule_sub_4.sub('', s)
                # Collapse repeated '。,?!@*' symbols into one.
                s = self.rule_sub_5.sub(r'\1', s)
                # Collapse patterns repeated three or more times to one copy.
                s = self.rule_sub_6.sub(r'\1', s)
                # Strip HTML remnants.
                s = self.rule_sub_7.sub('', s)
                if self.rule_remove_1.search(s) or \
                        len(self.rule_remove_2.findall(s)) / para_len > 0.05 or \
                        len(self.rule_remove_3.findall(s)) / para_len > 0.05:
                    continue
                # Normalize to simplified Chinese before yielding.
                yield zhconv.convert(s + '\n', 'zh-cn')

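# Illustrative sketch (not part of the original pipeline): shows what the
# repetition rules in cleanPara do. The sample string is hypothetical, and is
# padded past the 200-character threshold that clean() requires.
def _demo_clean_para():
    padding = '这是一段正常的中文文本,用来把段落长度撑到两百字以上。' * 10
    noisy = padding + '重要!!!!' + '______' + '买买买买买买'
    for cleaned in cleanPara().clean([noisy]):
        # rule_sub_6 collapses the ten repeated padding sentences (and the
        # repeated '买') to one copy, rule_sub_5 collapses '!!!!' to '!',
        # and rule_sub_3 turns '______' into ','.
        print(cleaned)
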
class cleanLine(object):
    def __init__(self):
        # Sentence punctuation used to decide whether to keep a line.
        self.rule_symbol = re.compile('[,。;]')
        # Chinese characters, common full-width punctuation, and digits.
        pattern = '[\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b1234567890]'
        self.rule_chinese = re.compile(pattern)
        # BOM / zero-width no-break space. NOTE: the literal character was lost
        # in extraction; '\ufeff' is reconstructed from the rule's name.
        self.rule_feff = re.compile('\ufeff')
        # Sliding window of three consecutive lines; line_1 is the one judged.
        self.line_0 = ''
        self.line_1 = ''
        self.line_2 = ''

    def clean(self, lines):
        for line in lines:
            self.line_0 = self.line_1
            self.line_1 = self.line_2
            self.line_2 = line
            # Drop lines containing \ufeff. (The `continue` is added so the
            # line is actually skipped rather than processed again below.)
            if self.rule_feff.findall(self.line_1):
                yield ''
                continue
            self.line_1 = self.line_1.strip()
            # Chinese-character ratio of each line in the window. (The
            # extracted source divided all three counts by len(line_1);
            # each line is measured against its own length here.)
            rate_0 = len(self.rule_chinese.findall(self.line_0)) / (len(self.line_0) + 0.0001)
            rate_1 = len(self.rule_chinese.findall(self.line_1)) / (len(self.line_1) + 0.0001)
            rate_2 = len(self.rule_chinese.findall(self.line_2)) / (len(self.line_2) + 0.0001)
            if rate_1 < 0.7 and (rate_0 < 0.7 or rate_2 < 0.7):
                # Mostly non-Chinese neighbourhood: emit a paragraph break.
                yield '\n'
            elif self.rule_symbol.search(self.line_1):
                yield self.line_1 + '\n'
            elif self.rule_symbol.search(self.line_0) and self.rule_symbol.search(self.line_2):
                yield self.line_1 + '\n'
            else:
                yield '\n'


class loadPara(object):
    def __init__(self):
        self.preLine = ''

    def load(self, lines):
        # Accumulate lines into paragraphs; blank lines are separators.
        res = ''
        for line in lines:
            if line == '\n' and self.preLine != '\n':
                yield res
            if line == '\n':
                res = ''
            else:
                res += line
            # Track the previous line so consecutive blanks yield only once.
            # (This assignment is missing from the extracted source but is
            # required for preLine to ever change.)
            self.preLine = line


def readLineContainingChinese(filename):
    grop = re.compile('[\u4e00-\u9fa5]')
    with gzip.open(filename, 'rt') as f:
        fore_line = ''
        while True:
            line = f.readline()
            # Preserve single blank lines as paragraph separators.
            if line == '\n' and fore_line != '\n':
                yield line
            fore_line = line
            # Keep lines where Chinese characters exceed 10% of the length.
            all_str = ''.join(grop.findall(line))
            if line:
                rate = len(all_str) / len(line)
                if rate > 0.1:
                    yield line
            if not line:
                break


def loadSensitiveWord(path):
    with open(path, encoding="utf-8") as f:
        vocabs = f.read().splitlines()
    return vocabs


def filter_by_trie(text, trie_model, threshold=3):
    """
    Label text by sensitive-word hits.
    0: dirty
    1: clean
    """
    res = trie_model.find_one(text)
    # print(res)
    if len(res) > threshold:
        label = 0
        return label, res
    elif len(res) >= 1 and len(text) < 200:
        # Short texts are dirty on any hit.
        label = 0
        return label, res
    else:
        label = 1
        return label, res


def cleanSensitiveWord(trie_model, paras):
    for para in paras:
        para = para.strip()
        if para:
            label_trie, match_result = filter_by_trie(para, trie_model, 3)
            if label_trie:
                yield para + "\n"


def clean(fileName, saveDir, sensitiveFile="sensitive_words-v2.txt"):
    # Chain the lazy generator stages: line filter -> line cleaner ->
    # paragraph assembly -> paragraph cleaner -> sensitive-word filter.
    lines = readLineContainingChinese(fileName)
    linesCleaned = cleanLine().clean(lines)
    paras = loadPara().load(linesCleaned)
    parasCleaned = cleanPara().clean(paras)
    dirty_vocab = loadSensitiveWord(sensitiveFile)
    trie_model = Trie_tree()
    trie_model.load_vocab(dirty_vocab)
    parasCleaned = cleanSensitiveWord(trie_model, parasCleaned)
    with open(saveDir, 'w', encoding='utf-8') as fw:
        for para in parasCleaned:
            fw.write(para)
            fw.write('\n')


if __name__ == '__main__':
    fileName = '/gdata/commonCrawl/common-crawl-WET-20201124-v1-ori/CC-MAIN-2019-04/CC-MAIN-20190116093643-20190116115643-00025.warc.wet.gz'
    saveDir = '/ghome/yands/dataset/tmp2.txt'
    sensitiveFile = "sensitive_words-v2.txt"
    clean(fileName, saveDir, sensitiveFile)
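
# The trie_tree_match module is not shipped with this script. The class below
# is a minimal illustrative stand-in for the interface the script assumes of
# Trie_tree -- load_vocab(words) builds the trie and find_one(text) returns
# the vocabulary words found in text -- the real implementation may differ.
class _TrieTreeSketch(object):
    def __init__(self):
        self.root = {}

    def load_vocab(self, words):
        # Insert each word character by character; '\0' marks a word end.
        for word in words:
            node = self.root
            for ch in word:
                node = node.setdefault(ch, {})
            node['\0'] = True

    def find_one(self, text):
        # Collect every occurrence of a vocabulary word in the text.
        matches = []
        for i in range(len(text)):
            node = self.root
            for j in range(i, len(text)):
                if text[j] not in node:
                    break
                node = node[text[j]]
                if '\0' in node:
                    matches.append(text[i:j + 1])
        return matches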
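
# Hypothetical usage of the sensitive-word filter with the sketch above. Per
# filter_by_trie, a text shorter than 200 characters is labelled dirty (0) on
# a single hit; longer texts tolerate up to `threshold` hits.
def _demo_filter_by_trie():
    trie = _TrieTreeSketch()
    trie.load_vocab(['赌博', '病毒'])  # placeholder words, not from the real list
    label, hits = filter_by_trie('这条短文本提到了赌博。', trie, 3)
    print(label, hits)  # -> 0 ['赌博']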
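
# Illustrative run of cleanLine's three-line window (hypothetical input).
# A line is blanked when it and at least one neighbour fall below the 0.7
# Chinese-character ratio; note that a low-ratio line flanked by two
# punctuated lines (like the 'nav' line here) is still kept, and that the
# windowing makes output lag the input by one line.
def _demo_clean_line():
    lines = ['第一行,有标点。\n', 'nav | home | about\n', '第三行也是正文,继续。\n', '\n']
    for out in cleanLine().clean(lines):
        print(repr(out))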