"""Preprocessing pipeline for the sogou user_tag_query training data.

Steps (each reads the previous step's output file):
  1. pre1/pre2 : extract the query columns, drop URL-only lines
  2. jieba word segmentation
  3. stop-word removal
  4. seed-keyword frequency report and line selection
"""
import re
from collections import Counter

import jieba  # third-party segmenter; required by step 2

# ---------------------------------------------------------------------------
# 1. Preprocessing
# ---------------------------------------------------------------------------
path1 = "./user_tag_query.10W.TRAIN"
path2 = "./words1.train"
path3 = "./words2.train"

# Matches an (optionally colon-prefixed) http/https URL at the start of a
# field.  BUGFIX: compiled once here instead of once per input line.
_URL_PATTERN = re.compile(
    r'[:]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def pre1(path):
    """Extract the query columns from the raw training file.

    Reads the gb18030-encoded file at *path*; each line is tab-separated
    with columns 0-3 being user id / demographic tags and the queries
    starting at column 4.  Writes one query per line to ./words1.train.

    BUGFIX: the original ignored *path* and always read the global path1;
    the parameter is now honored (the script still calls pre1(path1), so
    the script's behavior is unchanged).  The line is stripped before
    splitting so the trailing newline no longer produces a blank line
    after every user's queries.
    """
    with open(path, 'r', encoding='gb18030') as data, \
         open('./words1.train', 'w', encoding='utf-8') as output:
        for line in data:
            queries = line.rstrip('\n').split('\t')[4:]
            output.write('\n'.join(queries) + '\n')


def pre2(in_path, out_path):
    """Copy *in_path* to *out_path*, dropping lines whose first field is a URL.

    BUGFIX: the original opened its SECOND argument for reading and the
    global path3 for writing -- with the call pre2(path2, path3) it read
    and wrote './words2.train' at the same time and never consumed
    './words1.train' at all.  It now reads from the first argument and
    writes to the second, matching the call site.  The kept line is
    written as-is instead of re-joined with an extra '\n' appended
    (which produced a spurious blank line after every entry).
    """
    with open(in_path, 'r', encoding='utf-8') as train_data, \
         open(out_path, 'w', encoding='utf-8') as result_data:
        for line in train_data:
            word_list = line.split('\t')
            if _URL_PATTERN.match(word_list[0]):
                continue
            result_data.write(line)


pre1(path1)
pre2(path2, path3)

# ---------------------------------------------------------------------------
# 2. Word segmentation
# ---------------------------------------------------------------------------
path4 = './words3.train'
with open(path3, 'r', encoding='utf-8') as train_data, \
     open(path4, 'w', encoding='utf-8') as result_data:
    for sentence in train_data:
        # rstrip('\n') instead of [:-1]: a final line without a trailing
        # newline must not lose its last character.
        word_seg = jieba.cut(sentence.rstrip('\n'))
        result_data.write('\t'.join(word_seg) + '\n')

# ---------------------------------------------------------------------------
# 3. Stop-word removal
# ---------------------------------------------------------------------------
cn_stopwords = "cn_stopwords.txt"
# BUGFIX: explicit encoding -- the stop-word list is Chinese text and the
# original relied on the platform default encoding.
with open(cn_stopwords, 'r', encoding='utf-8') as cn_stopwords_dict:
    cn_stopwords_list = cn_stopwords_dict.read().splitlines()


def cn_stopwords_filter(word_list, stop_words_list):
    """Return *word_list* with every word found in *stop_words_list* removed.

    BUGFIX: the original built a set from the stop words but then tested
    membership against the raw list (O(n) per word); the set is now
    actually used for O(1) lookups.
    """
    stopwords = set(stop_words_list)
    return [word for word in word_list if word not in stopwords]


with open(path4, 'r', encoding='utf-8') as train_data, \
     open('./word4.train', 'w', encoding='utf-8') as result_data:
    for line in train_data:
        # BUGFIX: the original passed the undefined name 'stop_words_list'
        # here, which raised NameError at runtime; the module-level list is
        # 'cn_stopwords_list'.
        word_list = cn_stopwords_filter(line.rstrip('\n').split('\t'),
                                        cn_stopwords_list)
        if not word_list:
            continue
        result_data.write('\t'.join(word_list) + '\n')

# ---------------------------------------------------------------------------
# 4. Seed-keyword selection
# ---------------------------------------------------------------------------


def count_word(filename):
    """Return a flat list of all tab-separated words in *filename*."""
    wordlist = []
    with open(filename, 'r', encoding='utf-8') as data_file:
        for line in data_file:
            wordlist.extend(line.rstrip('\n').split('\t'))
    return wordlist


# NOTE(review): './stopwords_data.train' is never produced by this script;
# presumably this should be './word4.train' -- confirm before changing.
word_list = count_word('./stopwords_data.train')
count_result = Counter(word_list)
for key, val in count_result.most_common(20):
    print(key, val)

# Keep only the lines that mention at least one seed keyword.
words_list = ['小说', '电脑', '笔记本', '王者', '平板', '手机']
with open('./word4.train', 'r', encoding='utf-8') as train_data, \
     open('./result', 'w', encoding='utf-8') as result_data:
    for line in train_data:
        if any(seedword in line for seedword in words_list):
            result_data.write(line)