From ba1c82e5443791916acfa00c196bae1698034491 Mon Sep 17 00:00:00 2001 From: yands <273471520@qq.com> Date: Fri, 23 Jul 2021 17:40:09 +0800 Subject: [PATCH] first commit --- README.md | 2 + src/.DS_Store | Bin 8196 -> 8196 bytes src/1-FilterLanguage/languageExtract.py | 118 ---- src/cc_cleaner.py | 209 +++++++ src/sensitive_words-v2.txt | 724 ++++++++++++++++++++++++ src/trie_tree_match.py | 91 +++ 6 files changed, 1026 insertions(+), 118 deletions(-) delete mode 100644 src/1-FilterLanguage/languageExtract.py create mode 100644 src/cc_cleaner.py create mode 100755 src/sensitive_words-v2.txt create mode 100755 src/trie_tree_match.py diff --git a/README.md b/README.md index 4bd44ed4..b1232381 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ DataCollector项目主要介绍NLP预训练模型训练数据集资源、数据清洗过滤方法。 [[网页数据介绍及清洗过滤方法](#网页数据介绍及清洗过滤方法)] + - Common Crawl介绍 - Common Crawl数据格式 @@ -16,6 +17,7 @@ [[加入鹏程·PanGu-α微信交流群](#微信交流群)]