diff --git a/src/cc_cleaner.py b/src/cc_cleaner.py index 22d9b927..a0a212cd 100644 --- a/src/cc_cleaner.py +++ b/src/cc_cleaner.py @@ -126,7 +126,7 @@ class loadPara(object): res += line -def readLine(filename): +def readLineContainingChinese(filename): grop = re.compile('[\u4e00-\u9fa5]') with gzip.open(filename, 'rt') as f: fore_line = '' @@ -181,7 +181,7 @@ def cleanSensitiveWord(trie_model, paras): def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"): - lines = readLine(fileName) + lines = readLineContainingChinese(fileName) linesCleaned = cleanLine().clean(lines) paras = loadPara().load(linesCleaned) parasCleaned = cleanPara().clean(paras)