|
|
|
@@ -126,7 +126,7 @@ class loadPara(object): |
|
|
|
res += line |
|
|
|
|
|
|
|
|
|
|
|
def readLine(filename): |
|
|
|
def readLineContainingChinese(filename): |
|
|
|
grop = re.compile('[\u4e00-\u9fa5]') |
|
|
|
with gzip.open(filename, 'rt') as f: |
|
|
|
fore_line = '' |
|
|
|
@@ -181,7 +181,7 @@ def cleanSensitiveWord(trie_model, paras): |
|
|
|
|
|
|
|
|
|
|
|
def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"): |
|
|
|
lines = readLine(fileName) |
|
|
|
lines = readLineContainingChinese(fileName) |
|
|
|
linesCleaned = cleanLine().clean(lines) |
|
|
|
paras = loadPara().load(linesCleaned) |
|
|
|
parasCleaned = cleanPara().clean(paras) |
|
|
|
|