From 0e13fbccaa98a9487258d22ea380835f48f6bda8 Mon Sep 17 00:00:00 2001 From: yands <273471520@qq.com> Date: Sat, 24 Jul 2021 07:47:37 +0800 Subject: [PATCH] first commit --- src/cc_cleaner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cc_cleaner.py b/src/cc_cleaner.py index 22d9b927..a0a212cd 100644 --- a/src/cc_cleaner.py +++ b/src/cc_cleaner.py @@ -126,7 +126,7 @@ class loadPara(object): res += line -def readLine(filename): +def readLineContainingChinese(filename): grop = re.compile('[\u4e00-\u9fa5]') with gzip.open(filename, 'rt') as f: fore_line = '' @@ -181,7 +181,7 @@ def cleanSensitiveWord(trie_model, paras): def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"): - lines = readLine(fileName) + lines = readLineContainingChinese(fileName) linesCleaned = cleanLine().clean(lines) paras = loadPara().load(linesCleaned) parasCleaned = cleanPara().clean(paras)