import re
import jieba
from collections import Counter

# 1. Preprocessing

path1 = "./user_tag_query.10W.TRAIN"   # raw training file (GB18030-encoded)
path2 = "./words1.train"               # one query per line
path3 = "./words2.train"               # queries with bare-URL lines dropped

def pre1(in_path, out_path):
    """Keep only the query fields (column 5 onward) of each record."""
    data = open(in_path, 'r', encoding='gb18030')
    output = open(out_path, 'w', encoding='utf-8')
    for line in data:
        fields = line.rstrip('\n').split('\t')
        queries = fields[4:]          # drop the four leading profile fields
        output.write('\n'.join(queries) + '\n')
    data.close()
    output.close()

def pre2(in_path, out_path):
    """Drop lines whose query is just a URL."""
    train_data = open(in_path, 'r', encoding='utf-8')
    result_data = open(out_path, 'w', encoding='utf-8')
    # Compile once, outside the loop.
    pattern = re.compile(r'[:]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    for line in train_data:
        word_list = line.split('\t')
        if pattern.match(word_list[0]):
            continue
        # The last field still carries its newline, so no extra '\n' is needed.
        result_data.write('\t'.join(word_list))
    train_data.close()
    result_data.close()

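# Quick, disposable check that the URL pattern behaves as intended. The
# compiled expression below mirrors the one inside pre2; the sample strings
# are made up for illustration.
_url = re.compile(r'[:]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
assert _url.match('http://www.baidu.com')        # bare URL: filtered out
assert not _url.match('笔记本推荐')              # normal query: kept
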
pre1(path1, path2)   # must run first: pre2 reads pre1's output
pre2(path2, path3)

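# A sanity check of the assumed record layout. This layout is an assumption,
# not stated in the script: the raw TRAIN file is believed to be
# "ID \t age \t gender \t education \t query1 \t query2 ...", which is what
# fields[4:] in pre1 relies on. The sample record here is invented.
_sample = "u001\t2\t1\t3\t小说排行榜\t笔记本推荐"
assert _sample.split('\t')[4:] == ['小说排行榜', '笔记本推荐']
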
# 2. Word segmentation

path4 = './words3.train'
train_data = open(path3, 'r', encoding='utf-8')
result_data = open(path4, 'w', encoding='utf-8')

for sentence in train_data:
    sentence = sentence.rstrip('\n')
    word_seg = jieba.cut(sentence)               # lazy generator of tokens
    result_data.write('\t'.join(word_seg) + '\n')

train_data.close()
result_data.close()

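# Illustration only (safe to delete): jieba.cut yields tokens lazily, so a
# given generator can be joined exactly once. The exact segmentation depends
# on jieba's dictionary version.
print('\t'.join(jieba.cut('我想买一台笔记本电脑')))
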
# 3. Stop-word removal

cn_stopwords = "cn_stopwords.txt"
cn_stopwords_dict = open(cn_stopwords, 'r', encoding='utf-8')
cn_stopwords_list = cn_stopwords_dict.read().splitlines()
cn_stopwords_dict.close()

def cn_stopwords_filter(word_list, stop_words_list):
    """Return word_list with every stop word removed."""
    stop_words_set = set(stop_words_list)        # O(1) membership tests
    return [word for word in word_list if word not in stop_words_set]

train_data = open(path4, 'r', encoding='utf-8')
result_data = open('./word4.train', 'w', encoding='utf-8')

for line in train_data:
    word_list = line.rstrip('\n').split('\t')
    word_list = cn_stopwords_filter(word_list, cn_stopwords_list)
    if len(word_list) == 0:                      # every token was a stop word
        continue
    result_data.write('\t'.join(word_list) + '\n')

train_data.close()
result_data.close()

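# A minimal check of the filter with an inline stop-word list (independent of
# whatever cn_stopwords.txt actually contains):
assert cn_stopwords_filter(['我', '的', '手机', '了'], ['的', '了']) == ['我', '手机']
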
# 4. Pick seed keywords

def count_word(filename):
    """Collect every token in the file into one flat list."""
    wordlist = []
    data_file = open(filename, 'r', encoding='utf-8')
    for line in data_file:
        words = line.rstrip('\n').split('\t')
        wordlist.extend(words)
    data_file.close()
    return wordlist

word_list = count_word('./word4.train')   # the stop-word-filtered file from step 3
count_result = Counter(word_list)
for key, val in count_result.most_common(20):
    print(key, val)
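# Counter.most_common(20) lists the 20 highest-frequency (word, count) pairs
# in descending order; the seed list below was presumably chosen by eyeballing
# this output.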

train_data = open('./word4.train', 'r', encoding='utf-8')
result_data = open('./result', 'w', encoding='utf-8')

# Seed keywords: novel, computer, laptop, 王者 (likely the game 王者荣耀 /
# Honor of Kings), tablet, mobile phone.
words_list = ['小说', '电脑', '笔记本', '王者', '平板', '手机']
for line in train_data:
    flag = False
    for seedword in words_list:
        if seedword in line:
            flag = True
            break
    if flag:
        result_data.write(line)

train_data.close()
result_data.close()
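
# Note: `seedword in line` is a substring test, so e.g. '手机' also matches a
# line whose only related token is '手机壳'. For exact-token matching, one
# could compare against the tab-separated tokens instead:
#     if set(line.rstrip('\n').split('\t')) & set(words_list):
#         result_data.write(line)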