| @@ -126,7 +126,7 @@ class loadPara(object): | |||||
| res += line | res += line | ||||
| def readLine(filename): | |||||
| def readLineContainingChinese(filename): | |||||
| grop = re.compile('[\u4e00-\u9fa5]') | grop = re.compile('[\u4e00-\u9fa5]') | ||||
| with gzip.open(filename, 'rt') as f: | with gzip.open(filename, 'rt') as f: | ||||
| fore_line = '' | fore_line = '' | ||||
| @@ -181,7 +181,7 @@ def cleanSensitiveWord(trie_model, paras): | |||||
| def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"): | def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"): | ||||
| lines = readLine(fileName) | |||||
| lines = readLineContainingChinese(fileName) | |||||
| linesCleaned = cleanLine().clean(lines) | linesCleaned = cleanLine().clean(lines) | ||||
| paras = loadPara().load(linesCleaned) | paras = loadPara().load(linesCleaned) | ||||
| parasCleaned = cleanPara().clean(paras) | parasCleaned = cleanPara().clean(paras) | ||||