| @@ -3,6 +3,7 @@ | |||
| DataCollector项目主要介绍NLP预训练模型训练数据集资源、数据清洗过滤方法。 | |||
| <!-- [[1.数据集资源](#数据集资源)] --> | |||
| [[网页数据介绍及清洗过滤方法](#网页数据介绍及清洗过滤方法)] | |||
| - Common Crawl介绍 | |||
| - Common Crawl数据格式 | |||
| <!-- - (2)Common Crawl数据统计 --> | |||
| @@ -16,6 +17,7 @@ | |||
| [[加入鹏程·PanGu-α微信交流群](#微信交流群)] | |||
| <!-- | |||
| # 数据集资源 | |||
| | 序号 | 数据集 | 数据集大小 |数据集说明| | |||
| | :-------------- | :---- | :------------------------------------------------------------ |:-------------------- | | |||
| | 1 | CLUECorpus2020 |100G |来自CLUE官方搜集 | | |||
| @@ -1,118 +0,0 @@ | |||
| import glob | |||
| import re | |||
| import multiprocessing | |||
| import gzip | |||
| import os | |||
| import copy | |||
def read_gz_file(path):
    """Lazily yield the raw lines of a gzip-compressed file.

    The file is opened with mode ``'r'``, which for ``gzip.open`` is binary,
    so every yielded line is a ``bytes`` object including its line ending.

    If ``path`` does not exist a message is printed and nothing is yielded.
    Because this is a generator, that check only runs once iteration starts.
    """
    if not os.path.exists(path):
        print('the path [{}] is not exist!'.format(path))
        return
    with gzip.open(path, 'r') as pf:
        yield from pf
def process_oneFile(path_input, root_path_save, language_uni_code_dict, lang_rule_str):
    """Split one gzip text file into per-language text files.

    Args:
        path_input: '/'-separated path of the gzip input file. Its last two
            components are used to build the output file name.
        root_path_save: output root; it is concatenated as-is, so it is
            expected to end with '/'.
        language_uni_code_dict: mapping of language name -> regex (normally a
            character class) matching that language's characters.
        lang_rule_str: regex matching characters of any target language; used
            as a cheap pre-filter before the per-language checks.

    For every language the output is written to
    ``root_path_save + <language> + "/" + <parent dir>-<file name>.txt``.
    Languages whose output file already exists are skipped; if all of them
    exist the input file is not read at all.

    A line is kept for a language when more than 20% of its characters match
    that language's regex and the line is longer than 30 characters. Blank
    lines are kept as separators, with runs of blank lines collapsed to one.
    """
    def save_path_for(lang):
        # Output name is derived from the last two components of the input path.
        parts = path_input.split("/")
        return root_path_save + lang + "/" + parts[-2] + "-" + parts[-1] + ".txt"

    total_grop = re.compile(lang_rule_str)

    # Only languages without an existing output file are processed; their
    # patterns are compiled once here instead of once per input line.
    pending = {}
    for lang, rule in language_uni_code_dict.items():
        out_path = save_path_for(lang)
        if os.path.exists(out_path):
            print("file has already cleaned:", out_path)
        else:
            pending[lang] = re.compile(rule)
    if not pending:
        return

    texts = {lang: [] for lang in pending}
    # Explicit UTF-8 so decoding does not depend on the worker's locale.
    with gzip.open(path_input, 'rt', encoding='utf-8') as f:
        for line in f:  # stream instead of loading the whole file
            if line == '\n':
                for kept in texts.values():
                    kept.append(line)
                continue
            line_len = len(line)
            # Pre-filter: skip lines with under 20% target-language characters.
            if len(''.join(total_grop.findall(line))) / line_len < 0.2:
                continue
            for lang, grop in pending.items():
                rate = len(''.join(grop.findall(line))) / line_len
                if rate > 0.2 and line_len > 30:
                    texts[lang].append(line)

    for lang, kept in texts.items():
        # exist_ok avoids the check-then-create race between pool workers.
        os.makedirs(root_path_save + lang + "/", exist_ok=True)
        with open(save_path_for(lang), 'w', encoding='utf-8') as file_w:
            fore_line = ''
            for line in kept:
                # Collapse consecutive blank lines into a single one.
                if fore_line == '\n' and line == '\n':
                    continue
                file_w.write(line)
                fore_line = line
if __name__ == '__main__':
    # Input glob patterns and output root (must end with '/').
    paths_list = ["/gdata/commonCrawl/common-crawl-WET-20201124-v2-ori/*/*.warc.wet.gz"]
    output_root_paths = "/gdata/commonCrawl/multi-lingual-ethnic/"

    # Unicode ranges per language. Code points above U+FFFF need the 8-digit
    # \UXXXXXXXX escape: '\u17000' would be parsed as '\u1700' followed by '0'.
    language_uni_code_dict = {'Tangut': '[\U00017000-\U000187FF]',
                              'Miao': '[\U00016F00-\U00016F9F]',
                              'Lisu': '[\uA4D0-\uA4FF]',
                              'Yi': '[\uA000-\uA4CF]',
                              'Devanagari': '[\u0900-\u097F]'}
    # Union of all ranges above, used as a pre-filter.
    total_rule_str = '[\U00017000-\U000187FF' \
                     '\U00016F00-\U00016F9F' \
                     '\uA4D0-\uA4FF' \
                     '\uA000-\uA4CF' \
                     '\u0900-\u097F]'

    # Collect input files.
    original_file_paths = []
    for path in paths_list:
        original_file_paths.extend(glob.glob(path))
        print(path)
    file_num = len(original_file_paths)
    print("file num:", file_num)

    # Extract the per-language data in parallel; no more workers than files.
    num_processes = max(1, min(300, file_num))
    pool = multiprocessing.Pool(processes=num_processes)
    for input_file in original_file_paths:
        # process_oneFile needs all four arguments; failures are reported
        # through error_callback instead of being silently discarded.
        pool.apply_async(
            process_oneFile,
            (input_file, output_root_paths, language_uni_code_dict, total_rule_str),
            error_callback=lambda exc, p=input_file: print("failed:", p, repr(exc)),
        )
    pool.close()
    pool.join()
| @@ -0,0 +1,209 @@ | |||
| import gzip | |||
| import re | |||
| import zhconv | |||
| from trie_tree_match import Trie_tree | |||
class cleanPara(object):
    """Paragraph-level cleaner: normalises noisy punctuation and drops bad paragraphs."""

    def __init__(self):
        # Paragraphs containing any code point in these ranges (which include
        # Japanese kana) are dropped.
        self.rule_riwen = re.compile('[\u3040-\u31BF\u31F0-\u31FF]')
        # Runs of two or more '_', '|', '~', '-', '—' are replaced by ','; when
        # the run touches a punctuation mark only that mark is kept.
        # The punctuation must be a character class, and '-' must be escaped
        # inside the class (an unescaped '|-|' is a range and never matches '-').
        self.rule_sub_1 = re.compile(r'([。;,、?\.])[_|~\-—]{2,}')
        self.rule_sub_2 = re.compile(r'[_|~\-—]{2,}([。;,、?\.])')
        self.rule_sub_3 = re.compile(r'[_|~\-—]{2,}')
        # Remove runs of six or more '.'.
        self.rule_sub_4 = re.compile(r'[\.]{6,}')
        # Collapse a punctuation mark repeated two or more times into one.
        self.rule_sub_5 = re.compile(r'([。,?!@*]+?)\1+')
        # Collapse any pattern repeated three or more times in a row into one.
        self.rule_sub_6 = re.compile(r'(.+?)\1\1+')
        # Remove html tags: <br> <b> <li> <ol> <blockquote>.
        self.rule_sub_7 = re.compile(r'((<br>)|(<br>)|(<b>)|(<li>)|(<ol>)|(<blockquote>))')
        # Repeated line breaks (compiled but not applied by clean()).
        self.rule_sub_8 = re.compile(r'([\n\r]+?)\1+')
        # Paragraphs containing a literal backslash-n are dropped.
        self.rule_remove_1 = re.compile(r'\\n')
        # Spaces between two non-alphanumeric characters; too many drop the paragraph.
        self.rule_remove_2 = re.compile(r'[^a-zA-Z0-9][ ][^a-zA-Z0-9]')
        # '|' characters; too many drop the paragraph.
        self.rule_remove_3 = re.compile(r'\|')

    def _normalize(self, s):
        """Apply the substitution rules, in order, to one paragraph."""
        # r'\1' is the backreference; a plain '\1' would insert chr(1).
        s = self.rule_sub_1.sub(r'\1', s)
        s = self.rule_sub_2.sub(r'\1', s)
        s = self.rule_sub_3.sub(',', s)
        s = self.rule_sub_4.sub('', s)
        s = self.rule_sub_5.sub(r'\1', s)
        s = self.rule_sub_6.sub(r'\1', s)
        s = self.rule_sub_7.sub('', s)
        return s

    def clean(self, paras):
        """Yield cleaned paragraphs from an iterable of paragraph strings.

        A paragraph is dropped when, after stripping, it matches
        ``rule_riwen``, is 200 characters or shorter, contains a literal
        backslash-n, or has more than 5% "isolated space" or '|' matches.
        The 5% ratios are relative to the length before substitution.
        Kept paragraphs get a trailing newline and are converted with
        ``zhconv.convert(..., 'zh-cn')``.
        """
        for para in paras:
            para = para.strip()
            if self.rule_riwen.search(para):
                continue
            para_len = len(para)
            if para_len <= 200:
                continue
            s = self._normalize(para)
            if self.rule_remove_1.search(s):
                continue
            if len(self.rule_remove_2.findall(s)) / para_len > 0.05:
                continue
            if len(self.rule_remove_3.findall(s)) / para_len > 0.05:
                continue
            yield zhconv.convert(s + '\n', 'zh-cn')
class cleanLine(object):
    """Line-level filter that looks at each line together with its two neighbours."""

    def __init__(self):
        # Punctuation whose presence marks a line as sentence-like.
        self.rule_symbol = re.compile('[,。;]')
        # CJK ideographs, common Chinese punctuation and digits.
        pattern = '[\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b1234567890]'
        self.rule_chinese = re.compile(pattern)
        self.rule_feff = re.compile('<feff>')
        # Sliding window: previous line, line under test, next line.
        self.line_0 = ''
        self.line_1 = ''
        self.line_2 = ''

    def _chinese_rate(self, text):
        """Share of characters in ``text`` matched by ``rule_chinese``."""
        # The small constant avoids division by zero on empty lines.
        return len(self.rule_chinese.findall(text)) / (len(text) + 0.0001)

    def clean(self, lines):
        """Yield filtered lines, using '\\n' as a paragraph break marker.

        The line being judged is ``line_1``, i.e. the line read one iteration
        earlier, so output lags the input by one line.
        NOTE(review): because of that lag the very last input line is never
        emitted and the first iteration judges an empty line - confirm this
        is acceptable for callers.
        """
        for line in lines:
            self.line_0 = self.line_1
            self.line_1 = self.line_2
            self.line_2 = line
            # Lines containing the literal text '<feff>' emit an empty string.
            # NOTE(review): processing continues afterwards, so the line itself
            # may still be emitted below - confirm whether it should be skipped.
            if self.rule_feff.findall(self.line_1):
                yield ''
            self.line_1 = self.line_1.strip()

            # Each rate is relative to the length of its own line.
            rate_0 = self._chinese_rate(self.line_0)
            rate_1 = self._chinese_rate(self.line_1)
            rate_2 = self._chinese_rate(self.line_2)

            # A low-Chinese line with a low-Chinese neighbour adds a break.
            if rate_1 < 0.7 and (rate_0 < 0.7 or rate_2 < 0.7):
                yield '\n'
            # Keep the line if it, or both of its neighbours, contain punctuation.
            if self.rule_symbol.search(self.line_1):
                yield self.line_1 + '\n'
            elif self.rule_symbol.search(self.line_0) and self.rule_symbol.search(self.line_2):
                yield self.line_1 + '\n'
            else:
                yield '\n'
class loadPara(object):
    """Group a stream of lines into paragraphs separated by blank ('\\n') lines."""

    def __init__(self):
        # Last line seen by load(); used to collapse consecutive blank lines.
        self.preLine = ''

    def load(self, lines):
        """Yield one string per paragraph.

        A paragraph is the concatenation of the lines between blank lines.
        Consecutive blank lines produce a single paragraph boundary, and a
        non-empty paragraph still pending when the input ends is yielded too.
        """
        self.preLine = ''
        parts = []
        for line in lines:
            if line == '\n':
                # Only the first blank line of a run closes a paragraph.
                if self.preLine != '\n':
                    yield ''.join(parts)
                parts = []
            else:
                parts.append(line)
            self.preLine = line
        # Flush the trailing paragraph when the input does not end with a blank line.
        if parts:
            yield ''.join(parts)
def readLine(filename):
    """Yield the blank and Chinese-bearing lines of a gzip text file.

    A blank line is yielded only when the line before it was not blank.
    Any other line is yielded when more than 10% of its characters are in
    the range U+4E00-U+9FA5. Lines keep their trailing newline.
    """
    grop = re.compile('[\u4e00-\u9fa5]')
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        fore_line = ''
        for line in f:
            if line == '\n' and fore_line != '\n':
                yield line
            fore_line = line
            if len(grop.findall(line)) / len(line) > 0.1:
                yield line
def loadSensitiveWord(path):
    """Read a UTF-8 word list and return its lines as a list of strings."""
    with open(path, encoding="utf-8") as f:
        return f.read().splitlines()
def filter_by_trie(text, trie_model, threhold=3):
    """Label ``text`` as dirty (0) or clean (1) from its sensitive-word hits.

    ``trie_model.find_one(text)`` supplies the list of hits. The text is
    dirty when it has more than ``threhold`` hits, or when it has at least
    one hit and is shorter than 200 characters.

    Returns:
        ``(label, hits)`` where ``label`` is 0 for dirty and 1 for clean.
    """
    res = trie_model.find_one(text)
    hit_count = len(res)
    over_threshold = hit_count > threhold
    short_with_hit = hit_count >= 1 and len(text) < 200
    label = 0 if (over_threshold or short_with_hit) else 1
    return label, res
def cleanSensitiveWord(trie_model, paras):
    """Yield the paragraphs that ``filter_by_trie`` labels as clean.

    Each paragraph is stripped first; empty ones are skipped. Kept paragraphs
    are yielded stripped, with a single trailing newline.
    """
    for raw in paras:
        text = raw.strip()
        if not text:
            continue
        label_trie, _hits = filter_by_trie(text, trie_model, 3)
        if label_trie:
            yield text + "\n"
def clean(fileName,saveDir, sensitiveFile="sensitive_words-v2.txt"):
    """Run the full cleaning pipeline on one gzip file and write the result.

    Args:
        fileName: gzip input file passed to ``readLine``.
        saveDir: path of the output text file (opened for writing as UTF-8).
        sensitiveFile: word list loaded into the trie used for filtering.

    Stages, all lazy generators: line reading, line filtering, paragraph
    grouping, paragraph cleaning, sensitive-word filtering. Each surviving
    paragraph is written followed by an extra newline.
    """
    # Build the sensitive-word matcher.
    trie_model = Trie_tree()
    trie_model.load_vocab(loadSensitiveWord(sensitiveFile))

    # Chain the generator stages; nothing is read until the loop below runs.
    stage = readLine(fileName)
    stage = cleanLine().clean(stage)
    stage = loadPara().load(stage)
    stage = cleanPara().clean(stage)
    stage = cleanSensitiveWord(trie_model, stage)

    with open(saveDir,'w',encoding='utf-8') as fw:
        for para in stage:
            fw.write(para + '\n')
if __name__ == '__main__':
    # Single-file run with hard-coded input, output and word-list paths.
    sensitiveFile = "sensitive_words-v2.txt"
    saveDir = '/ghome/yands/dataset/tmp2.txt'
    fileName = '/gdata/commonCrawl/common-crawl-WET-20201124-v1-ori/CC-MAIN-2019-04/CC-MAIN-20190116093643-20190116115643-00025.warc.wet.gz'
    clean(fileName, saveDir, sensitiveFile=sensitiveFile)
| @@ -0,0 +1,724 @@ | |||
| 爱女人 | |||
| 爱液 | |||
| 野战 | |||
| 按摩棒 | |||
| 拔出来 | |||
| 夜色聊人 | |||
| 爆草 | |||
| 包二奶 | |||
| 暴干 | |||
| 暴奸 | |||
| 暴乳 | |||
| 爆乳 | |||
| 暴淫 | |||
| 屄 | |||
| 被操 | |||
| 被插 | |||
| 被干 | |||
| 逼奸 | |||
| 仓井空 | |||
| 插暴 | |||
| 操逼 | |||
| 操黑 | |||
| 操烂 | |||
| 肏你 | |||
| 肏死 | |||
| 操死 | |||
| 操我 | |||
| 厕奴 | |||
| 插比 | |||
| 插b | |||
| 插逼 | |||
| 插进 | |||
| 插你 | |||
| 插我 | |||
| 插阴 | |||
| 潮吹 | |||
| 潮喷 | |||
| 成人dv | |||
| 成人电影 | |||
| 成人论坛 | |||
| 成人小说 | |||
| 成人电 | |||
| 成人卡通 | |||
| 成人聊 | |||
| 成人片 | |||
| 成人视 | |||
| 成人图 | |||
| 成人文 | |||
| 成人小 | |||
| 成人色情 | |||
| 成人网站 | |||
| 成人文学 | |||
| 艳情小说 | |||
| 成人游戏 | |||
| 吃精 | |||
| 赤裸 | |||
| 扌由插 | |||
| 抽一插 | |||
| 春药 | |||
| 大波 | |||
| 大力抽送 | |||
| 大乳 | |||
| 荡妇 | |||
| 荡女 | |||
| 盗撮 | |||
| 多人轮 | |||
| 发浪 | |||
| 放尿 | |||
| 肥逼 | |||
| 粉穴 | |||
| 封面女郎 | |||
| 风月大陆 | |||
| 干死你 | |||
| 干穴 | |||
| 肛交 | |||
| 肛门 | |||
| 龟头 | |||
| 裹本 | |||
| 国产av | |||
| 好嫩 | |||
| 豪乳 | |||
| 黑逼 | |||
| 后庭 | |||
| 后穴 | |||
| 虎骑 | |||
| 花花公子 | |||
| 换妻俱乐部 | |||
| 黄片 | |||
| 几吧 | |||
| 鸡吧 | |||
| 鸡巴 | |||
| 鸡奸 | |||
| 寂寞男 | |||
| 寂寞女 | |||
| 妓女 | |||
| 集体淫 | |||
| 奸情 | |||
| 叫床 | |||
| 脚交 | |||
| 金鳞岂是池中物 | |||
| 金麟岂是池中物 | |||
| 精液 | |||
| 就去日 | |||
| 巨屌 | |||
| 菊花洞 | |||
| 菊门 | |||
| 巨奶 | |||
| 巨乳 | |||
| 菊穴 | |||
| 开苞 | |||
| 口爆 | |||
| 口活 | |||
| 口交 | |||
| 口射 | |||
| 口淫 | |||
| 裤袜 | |||
| 狂操 | |||
| 狂插 | |||
| 浪逼 | |||
| 浪妇 | |||
| 浪叫 | |||
| 浪女 | |||
| 狼友 | |||
| 聊性 | |||
| 流淫 | |||
| 铃木麻 | |||
| 凌辱 | |||
| 漏乳 | |||
| 露b | |||
| 乱交 | |||
| 乱伦 | |||
| 轮暴 | |||
| 轮操 | |||
| 轮奸 | |||
| 裸陪 | |||
| 买春 | |||
| 美逼 | |||
| 美少妇 | |||
| 美乳 | |||
| 美腿 | |||
| 美穴 | |||
| 美幼 | |||
| 秘唇 | |||
| 迷奸 | |||
| 密穴 | |||
| 蜜穴 | |||
| 蜜液 | |||
| 摸奶 | |||
| 摸胸 | |||
| 母奸 | |||
| 奈美 | |||
| 奶子 | |||
| 男奴 | |||
| 内射 | |||
| 嫩逼 | |||
| 嫩女 | |||
| 嫩穴 | |||
| 捏弄 | |||
| 女优 | |||
| 炮友 | |||
| 砲友 | |||
| 喷精 | |||
| 屁眼 | |||
| 品香堂 | |||
| 前凸后翘 | |||
| 强jian | |||
| 强暴 | |||
| 强奸处女 | |||
| 情趣用品 | |||
| 情色 | |||
| 拳交 | |||
| 全裸 | |||
| 群交 | |||
| 惹火身材 | |||
| 人妻 | |||
| 人兽 | |||
| 日逼 | |||
| 日烂 | |||
| 肉棒 | |||
| 肉逼 | |||
| 肉唇 | |||
| 肉洞 | |||
| 肉缝 | |||
| 肉棍 | |||
| 肉茎 | |||
| 肉具 | |||
| 揉乳 | |||
| 肉穴 | |||
| 肉欲 | |||
| 乳爆 | |||
| 乳房 | |||
| 乳沟 | |||
| 乳交 | |||
| 乳头 | |||
| 三级片 | |||
| 骚逼 | |||
| 骚比 | |||
| 骚女 | |||
| 骚水 | |||
| 骚穴 | |||
| 色逼 | |||
| 色界 | |||
| 色猫 | |||
| 色盟 | |||
| 色情网站 | |||
| 色区 | |||
| 色色 | |||
| 色诱 | |||
| 色欲 | |||
| 色b | |||
| 少年阿宾 | |||
| 少修正 | |||
| 射爽 | |||
| 射颜 | |||
| 食精 | |||
| 释欲 | |||
| 兽奸 | |||
| 兽交 | |||
| 手淫 | |||
| 兽欲 | |||
| 熟妇 | |||
| 熟母 | |||
| 熟女 | |||
| 爽片 | |||
| 爽死我了 | |||
| 双臀 | |||
| 死逼 | |||
| 丝袜 | |||
| 丝诱 | |||
| 松岛枫 | |||
| 酥痒 | |||
| 汤加丽 | |||
| 套弄 | |||
| 体奸 | |||
| 体位 | |||
| 舔脚 | |||
| 舔阴 | |||
| 调教 | |||
| 偷欢 | |||
| 偷拍 | |||
| 推油 | |||
| 脱内裤 | |||
| 文做 | |||
| 我就色 | |||
| 无码 | |||
| 舞女 | |||
| 无修正 | |||
| 吸精 | |||
| 夏川纯 | |||
| 相奸 | |||
| 小逼 | |||
| 校鸡 | |||
| 小xue | |||
| 写真 | |||
| 性感妖娆 | |||
| 性感诱惑 | |||
| 性虎 | |||
| 性饥渴 | |||
| 性技巧 | |||
| 性交 | |||
| 性奴 | |||
| 性虐 | |||
| 性息 | |||
| 性欲 | |||
| 胸推 | |||
| 穴口 | |||
| 学生妹 | |||
| 穴图 | |||
| 亚情 | |||
| 颜射 | |||
| 阳具 | |||
| 要射了 | |||
| 夜勤病栋 | |||
| 一本道 | |||
| 一夜欢 | |||
| 一夜情 | |||
| 一ye情 | |||
| 阴部 | |||
| 淫虫 | |||
| 阴唇 | |||
| 淫荡 | |||
| 阴道 | |||
| 淫电影 | |||
| 阴阜 | |||
| 淫妇 | |||
| 淫河 | |||
| 阴核 | |||
| 阴户 | |||
| 淫贱 | |||
| 淫叫 | |||
| 淫教师 | |||
| 阴茎 | |||
| 阴精 | |||
| 淫浪 | |||
| 淫媚 | |||
| 淫糜 | |||
| 淫魔 | |||
| 淫母 | |||
| 淫女 | |||
| 淫虐 | |||
| 淫妻 | |||
| 淫情 | |||
| 淫色 | |||
| 淫声浪语 | |||
| 淫兽学园 | |||
| 淫书 | |||
| 淫术炼金士 | |||
| 淫水 | |||
| 淫娃 | |||
| 淫威 | |||
| 淫亵 | |||
| 淫样 | |||
| 淫液 | |||
| 淫照 | |||
| 阴b | |||
| 应召 | |||
| 幼交 | |||
| 幼男 | |||
| 幼女 | |||
| 欲火 | |||
| 欲女 | |||
| 玉女心经 | |||
| 玉蒲团 | |||
| 玉乳 | |||
| 欲仙欲死 | |||
| 玉穴 | |||
| 援交 | |||
| 原味内衣 | |||
| 援助交际 | |||
| 张筱雨 | |||
| 招鸡 | |||
| 招妓 | |||
| 中年美妇 | |||
| 抓胸 | |||
| 自拍 | |||
| 自慰 | |||
| 作爱 | |||
| 18禁 | |||
| 99bb | |||
| a4u | |||
| a4y | |||
| adult | |||
| amateur | |||
| anal | |||
| a片 | |||
| fuck | |||
| gay片 | |||
| g点 | |||
| g片 | |||
| hardcore | |||
| h动画 | |||
| h动漫 | |||
| incest | |||
| porn | |||
| secom | |||
| sexinsex | |||
| sm女王 | |||
| xiao77 | |||
| xing伴侣 | |||
| tokyohot | |||
| yin荡 | |||
| 贱人 | |||
| 装b | |||
| 大sb | |||
| 傻逼 | |||
| 傻b | |||
| 煞逼 | |||
| 煞笔 | |||
| 刹笔 | |||
| 傻比 | |||
| 沙比 | |||
| 欠干 | |||
| 婊子养的 | |||
| 我日你 | |||
| 我操 | |||
| 我草 | |||
| 卧艹 | |||
| 卧槽 | |||
| 爆你菊 | |||
| 艹你 | |||
| cao你 | |||
| 你他妈 | |||
| 真他妈 | |||
| 别他吗 | |||
| 草你吗 | |||
| 草你丫 | |||
| 操你妈 | |||
| 擦你妈 | |||
| 操你娘 | |||
| 操他妈 | |||
| 日你妈 | |||
| 干你妈 | |||
| 干你娘 | |||
| 娘西皮 | |||
| 狗操 | |||
| 狗草 | |||
| 狗杂种 | |||
| 狗日的 | |||
| 操你祖宗 | |||
| 操你全家 | |||
| 操你大爷 | |||
| 妈逼 | |||
| 你麻痹 | |||
| 麻痹的 | |||
| 妈了个逼 | |||
| 马勒 | |||
| 狗娘养 | |||
| 贱比 | |||
| 贱b | |||
| 下贱 | |||
| 死全家 | |||
| 全家死光 | |||
| 全家不得好死 | |||
| 全家死绝 | |||
| 白痴 | |||
| 无耻 | |||
| sb | |||
| 杀b | |||
| 你吗b | |||
| 你妈的 | |||
| 婊子 | |||
| 贱货 | |||
| 人渣 | |||
| 混蛋 | |||
| 媚外 | |||
| 和弦 | |||
| 性伴侣 | |||
| 男公关 | |||
| 火辣 | |||
| 精子 | |||
| 射精 | |||
| 诱奸 | |||
| 强奸 | |||
| 做爱 | |||
| 性爱 | |||
| 发生关系 | |||
| 按摩 | |||
| 快感 | |||
| 处男 | |||
| 猛男 | |||
| 少妇 | |||
| 屌 | |||
| 屁股 | |||
| 下体 | |||
| 内裤 | |||
| 浑圆 | |||
| 咪咪 | |||
| 发情 | |||
| 白嫩 | |||
| 粉嫩 | |||
| 兽性 | |||
| 风骚 | |||
| 呻吟 | |||
| sm | |||
| 阉割 | |||
| 高潮 | |||
| 裸露 | |||
| 不穿 | |||
| 一丝不挂 | |||
| 脱光 | |||
| 干你 | |||
| 干死 | |||
| 我干 | |||
| 裙中性运动 | |||
| 乱奸 | |||
| 乱伦类 | |||
| 乱伦小 | |||
| 伦理大 | |||
| 伦理电影 | |||
| 伦理毛 | |||
| 伦理片 | |||
| 裸聊 | |||
| 裸聊网 | |||
| 裸体写真 | |||
| 裸舞视 | |||
| 裸照 | |||
| 美女裸体 | |||
| 美女写真 | |||
| 美女上门 | |||
| 美艳少妇 | |||
| 妹按摩 | |||
| 妹上门 | |||
| 迷幻药 | |||
| 迷幻藥 | |||
| 迷昏口 | |||
| 迷昏药 | |||
| 迷昏藥 | |||
| 迷魂香 | |||
| 迷魂药 | |||
| 迷魂藥 | |||
| 迷奸粉 | |||
| 迷奸药 | |||
| 迷情粉 | |||
| 迷情水 | |||
| 迷情药 | |||
| 迷药 | |||
| 迷藥 | |||
| 谜奸药 | |||
| 骚妇 | |||
| 骚货 | |||
| 骚浪 | |||
| 骚嘴 | |||
| 色电影 | |||
| 色妹妹 | |||
| 色情表演 | |||
| 色情电影 | |||
| 色情服务 | |||
| 色情图片 | |||
| 色情小说 | |||
| 色情影片 | |||
| 色情片 | |||
| 色视频 | |||
| 色小说 | |||
| 性服务 | |||
| 性福情 | |||
| 性感少 | |||
| 性伙伴 | |||
| 性交视频 | |||
| 性交图片 | |||
| 性奴集中营 | |||
| 阴蒂 | |||
| 阴间来电 | |||
| 阴茎增大 | |||
| 阴茎助勃 | |||
| 阴毛 | |||
| 陰唇 | |||
| 陰道 | |||
| 陰戶 | |||
| 淫荡美女 | |||
| 淫荡视频 | |||
| 淫荡照片 | |||
| 淫乱 | |||
| 淫靡 | |||
| 淫魔舞 | |||
| 淫情女 | |||
| 淫肉 | |||
| 淫騷妹 | |||
| 淫兽 | |||
| 淫兽学 | |||
| 淫穴 | |||
| morphine | |||
| 摇头丸 | |||
| 乖乖粉 | |||
| narcotic | |||
| 麻醉药 | |||
| 精神药品 | |||
| 干逼 | |||
| 你妈逼 | |||
| 草你妈 | |||
| 艹你妈 | |||
| 草泥马 | |||
| 曹尼玛 | |||
| 失身粉 | |||
| 淫荡自慰器 | |||
| 啪啪 | |||
| 射了 | |||
| 自慰棒 | |||
| 肉肏 | |||
| 中出 | |||
| 肏 | |||
| 抽插 | |||
| 抠弄 | |||
| 小穴 | |||
| 制服诱惑 | |||
| 干到尖叫 | |||
| 大色窝 | |||
| 夜夜撸 | |||
| 色播播 | |||
| 同城交友 | |||
| 虐奸 | |||
| 骚屄 | |||
| 成av人 | |||
| 天天色 | |||
| 天天操 | |||
| 狠狠日 | |||
| 大香蕉网 | |||
| 人人操 | |||
| 伊人久久 | |||
| 人人干免费在线视频 | |||
| 传奇私服 | |||
| 热血传奇 | |||
| 传奇私服网站 | |||
| 新开传奇 | |||
| 时时彩 | |||
| 撸撸色 | |||
| 色中色 | |||
| 韩国三级电影 | |||
| 在线a视频 | |||
| 国产模特私拍视频 | |||
| av视频 | |||
| 香港经典三级 | |||
| 毛片免费观看 | |||
| 伊人大香蕉 | |||
| 久久网 | |||
| 日本高清色情 | |||
| 欧美色图 | |||
| 亚洲色综合 | |||
| 伊人香蕉视频 | |||
| 三级在线播放 | |||
| 青青草在线 | |||
| 韩国19禁电影 | |||
| 偷偷鲁手机在线 | |||
| 一起撸 | |||
| 人人干免费在线视频 | |||
| 亚洲国产手机在线无码 | |||
| 欧美三级网站 | |||
| 五月婷婷 | |||
| 韩国理论片 | |||
| 福利片_九九 | |||
| 苍苍影院 | |||
| 变色桃花源完整版 | |||
| 亚洲一区 | |||
| 日本一本二本三区无码 | |||
| 日本一区二区三区 | |||
| 亚洲人成视频 | |||
| 免费人成视频 | |||
| 欧美成 人 | |||
| 男女啪 | |||
| 久久爱 | |||
| 免费人成 | |||
| 天天j | |||
| 久久精品视频 | |||
| 国内偷拍在线精品 | |||
| 亚洲国产在线视频 | |||
| 在线aV | |||
| 自拍偷拍 | |||
| 国内偷拍 | |||
| 欧美另类 | |||
| 亚洲色av | |||
| av成人网 | |||
| 日本av | |||
| 欧美av | |||
| 成人av | |||
| 亚洲av | |||
| av在线 | |||
| av天堂 | |||
| av电影 | |||
| av视频 | |||
| 一级AVA | |||
| 免费AV视频 | |||
| AV视频 | |||
| 免费黄片 | |||
| 天天日 | |||
| 天天啪 | |||
| 日狠狠 | |||
| 国产自拍 | |||
| 深喉 | |||
| 国产AV | |||
| 久久偷拍 | |||
| 一本到2018 | |||
| 快播成人片 | |||
| 亚洲欧美国产 | |||
| 东京热 | |||
| 另类图片 | |||
| 情色五月天 | |||
| 色情五月 | |||
| 伊人在线 | |||
| 亚洲偷拍 | |||
| 国产与偷拍 | |||
| 威尼斯人官网威 | |||
| 日韩无码 | |||
| 无码中字 | |||
| 一级毛片 | |||
| 日本毛片 | |||
| 亚洲毛片 | |||
| 久久不射 | |||
| 色姑娘综合网 | |||
| 天天舔 | |||
| 天天射 | |||
| 婷婷我去也 | |||
| 偷窥女厕所 | |||
| 免费啪视频 | |||
| 亚洲色欲 | |||
| 色老头 | |||
| 妹子自流白浆 | |||
| 国产午夜精华 | |||
| 童颜巨乳 | |||
| SM重口味 | |||
| 美少女 | |||
| 在线视频 | |||
| 欧美极品 | |||
| 日韩无码 | |||
| 日韩有码 | |||
| 极骚萝莉 | |||
| 人妖视频 | |||
| 强奸乱伦 | |||
| 视频三区 | |||
| 绝美少女 | |||
| 国产精品 | |||
| 自拍偷拍 | |||
| 萝莉少女 | |||
| 3P合辑 | |||
| 视频四区 | |||
| 自慰喷水 | |||
| 人妻系列 | |||
| 巨乳系列 | |||
| 处女专栏 | |||
| 无码专区 | |||
| 视频五区 | |||
| 在线视频 | |||
| 亚洲电影 | |||
| 欧美电影 | |||
| 制服丝袜 | |||
| 强奸乱伦 | |||
| 变态另类 | |||
| 经典三级 | |||
| 成人动漫 | |||
| 人妻系列 | |||
| 丝袜诱惑 | |||
| 美臀美颜 | |||
| 日韩无码 | |||
| 巨乳诱惑 | |||
| 颜射口交 | |||
| 激情图区 | |||
| 亚洲色图 | |||
| 欧美色图 | |||
| 乱伦熟女 | |||
| 自拍偷拍 | |||
| 美女诱惑 | |||
| 巨乳诱惑 | |||
| 淫荡人妻 | |||
| 情色文学 | |||
| 人妻交换 | |||
| 家庭乱伦 | |||
| @@ -0,0 +1,91 @@ | |||
| import collections | |||
class TrieNode:
    """One trie node: children keyed by character plus an end-of-word flag."""

    def __init__(self):
        # Missing children are created on first item access.
        self.children = collections.defaultdict(TrieNode)
        self.is_word = False

    def __repr__(self):
        """Return a debugging dump of the subtree below this node."""
        s = ''
        first = True
        for k, v in self.children.items():
            if first:
                if v.is_word:
                    s += '{} -> {}\n'.format(k, v)
                else:
                    s += '{} -> {}'.format(k, v)
                first = False
                continue
            if v.is_word:
                s += '{}\n'.format(k)
            else:
                s += '{} -> {}'.format(k, v)
        return s


class Trie_tree:
    """Character trie used to look up vocabulary words inside a text."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie."""
        current = self.root
        for letter in word:
            current = current.children[letter]
        current.is_word = True

    def load_vocab(self,vocabs):
        """Insert every word of the iterable ``vocabs``."""
        for v in vocabs:
            self.insert(v)

    def search(self, word):
        """Return True if exactly ``word`` was inserted."""
        current = self.root
        for letter in word:
            # .get() does not create nodes, unlike item access on the defaultdict.
            current = current.children.get(letter)
            if current is None:
                return False
        return current.is_word

    def starts_with(self, prefix):
        """Return True if some inserted word starts with ``prefix``."""
        current = self.root
        for letter in prefix:
            current = current.children.get(letter)
            if current is None:
                return False
        return True

    def __repr__(self):
        return repr(self.root).replace('\n\n', '\n').replace('\n\n', '\n')

    def find_one(self, word):
        '''Find every vocabulary word that occurs in ``word``.

        :param word: str, the text to scan
        :return: list of matched substrings, ordered by start position and
            then by length; an empty list when nothing matches

        >>> a = Trie_tree()
        >>> a.insert('感冒')
        >>> a.find_one('我感冒了好难受怎么办')
        ['感冒']
        '''
        res = []
        for i in range(len(word)):
            node = self.root.children.get(word[i])
            if node is None:
                continue
            # A one-character vocabulary word ends on the first node already.
            if node.is_word:
                res.append(word[i])
            for j in range(i + 1, len(word)):
                node = node.children.get(word[j])
                if node is None:
                    break
                if node.is_word:
                    res.append(word[i:j + 1])
        return res
if __name__ == '__main__':
    # Small demo: build a trie from three words and show the matches in a sentence.
    a = Trie_tree()
    for demo_word in ["免费","动作","动作电影"]:
        a.insert(demo_word)
    print(a)
    matches = a.find_one('免费看动作电影')
    print(matches)