Browse Source

first commit

master
yands 4 years ago
parent
commit
0e13fbccaa
1 changed file with 2 additions and 2 deletions
  1. +2
    -2
      src/cc_cleaner.py

+ 2
- 2
src/cc_cleaner.py View File

@@ -126,7 +126,7 @@ class loadPara(object):
res += line


-def readLine(filename):
+def readLineContainingChinese(filename):
grop = re.compile('[\u4e00-\u9fa5]')
with gzip.open(filename, 'rt') as f:
fore_line = ''
@@ -181,7 +181,7 @@ def cleanSensitiveWord(trie_model, paras):


def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"):
-lines = readLine(fileName)
+lines = readLineContainingChinese(fileName)
linesCleaned = cleanLine().clean(lines)
paras = loadPara().load(linesCleaned)
parasCleaned = cleanPara().clean(paras)


Loading…
Cancel
Save