import jieba
from collections import Counter
import copy
import pandas as pd

# --- 1. Mediator keywords ---

def file(seedword, filename):
    """Copy every line of ./result.train that contains *seedword* into *filename*.

    Name kept for backward compatibility with existing callers.
    """
    # `with` guarantees both handles are closed even on error (the original
    # leaked them if an exception occurred between open() and close()).
    with open('./result.train', 'r', encoding='utf-8') as init_data, \
         open(filename, 'w', encoding='utf-8') as result_data:
        for line in init_data:
            if seedword in line:
                result_data.write(line)

seedwords = ['图片', '手机', '小说', '视频', '下载', 'qq', '电影', '百度', '英语', '游戏']

# Write the per-seed query files ./seedword_<seed>.
for seedword in seedwords:
    filename = './seedword_' + seedword
    file(seedword, filename)

def seedwords_segmentation(seedword):
    """Segment every query line of ./seedword_<seedword> with jieba and write
    the tokens, tab-separated, to ./seedword_segmentation_<seedword>."""
    file = './seedword_' + seedword
    segmentation_file = './seedword_segmentation_' + seedword
    with open(file, 'r', encoding='utf-8') as seedwords_data, \
         open(segmentation_file, 'w', encoding='utf-8') as segmentation_data:
        for x in seedwords_data:
            # rstrip('\n') instead of x[:-1]: the slice dropped the last
            # character of a final line that had no trailing newline.
            word_seg = jieba.cut(x.rstrip('\n'))
            segmentation_data.write('\t'.join(word_seg) + '\n')

for seedword in seedwords:
    seedwords_segmentation(seedword)

# Function words / punctuation that carry no topical meaning.
stop_word = set(['我', '都', '与', '你', '多少', '年', '月', '什么', '及',
                 '和', '上', '如何', '是', '吗', '有', '在', '可以', '什么', '用', '了', '说',
                 '大', '很', '版', '之', '为什么', '怎么办', '能', '玩', '级', '写', ',', ' ', '的', '怎么'])

def stop_words_filter(words):
    """Return *words* with all stop words removed, order preserved."""
    return [word for word in words if word not in stop_word]

def getwordlist(seedword):
    """Return the flat token list of ./seedword_segmentation_<seedword>."""
    wordlist = []
    segmentation_file = './seedword_segmentation_' + seedword
    with open(segmentation_file, 'r', encoding='utf-8') as seedwords_data:
        for line in seedwords_data:
            wordlist.extend(line.rstrip('\n').split('\t'))
    return wordlist

def word_frequency_count(seedword):
    """Return the 11 most frequent non-stop tokens for *seedword*, skipping
    the single most frequent one (presumably the seed keyword itself)."""
    word_list = stop_words_filter(getwordlist(seedword))
    count_result = Counter(word_list)
    # [1:] replaces the original first-iteration flag dance: drop the top
    # entry, keep the next 11.
    return [key for key, _ in count_result.most_common(12)[1:]]

# Candidate mediator keyword lists, one per seed keyword.
midkeywords_list = []
for seedword in seedwords:
    midkeywords_list.append(word_frequency_count(seedword))
print(midkeywords_list)
# NOTE(review): the original carried commented-out `.remove(<seed>)` calls to
# manually prune the seed word itself from each candidate list; the top entry
# is already skipped in word_frequency_count, so they are dropped here.

def seedword_query_volume(seedword):
    """Return the highest token frequency for *seedword*, used as the seed's
    query-volume denominator s (presumably the seed keyword's own count)."""
    top = Counter(getwordlist(seedword)).most_common(1)
    # Mirrors the original: None when the token list is empty.
    return top[0][1] if top else None

# s(seed): query volume per seed keyword.
s_query_volume = {}
for seedword in seedwords:
    s_query_volume[seedword] = seedword_query_volume(seedword)

# Mediator keywords of the 10 seed keywords, keyed by seed keyword.
midkeywords = dict(zip(seedwords, midkeywords_list))

# --- 2. Search-volume statistics ---

def query_volume_count(seedword):
    """Return, per mediator keyword of *seedword* (in midkeywords order), how
    many lines of ./seedword_<seedword> contain both the seed and the
    mediator keyword."""
    filename = './seedword_' + seedword
    # Read the file once; the original reopened and rescanned it for every
    # mediator keyword.
    with open(filename, 'r', encoding='utf-8') as init_data:
        sentences = init_data.readlines()
    count_list = []
    for midkeyword in midkeywords[seedword]:
        count_list.append(sum(1 for sentence in sentences
                              if seedword in sentence and midkeyword in sentence))
    return count_list

# sa(seed, mediator): co-occurrence search volume, seed -> mediator -> count.
sa_query_volume = {}
for seedword in seedwords:
    sa_query_volume[seedword] = dict(zip(midkeywords[seedword],
                                         query_volume_count(seedword)))
# --- 3. Mediator keyword weights ---

def count_midkeyword_weight():
    """Return seed -> {mediator: sa / s}, each mediator keyword's weight
    relative to its seed keyword's query volume."""
    midkeyword_weight = {}
    for seedword in seedwords:
        s = s_query_volume[seedword]
        weight_dict = {}
        for midkeyword in sa_query_volume[seedword]:
            weight_dict[midkeyword] = sa_query_volume[seedword][midkeyword] / s
        midkeyword_weight[seedword] = weight_dict
    return midkeyword_weight

# w(mediator): mediator keyword weights.
w_midkeyword = count_midkeyword_weight()

# --- 4. Competitive keyword sets ---

def comkey_words_file(seedword):
    """Write to ./comkey_<seedword> every line of ./words2.train that does NOT
    contain the seed keyword but contains at least one of its mediators."""
    filename = './comkey_' + seedword
    with open('./words2.train', 'r', encoding='utf-8') as init_data, \
         open(filename, 'w', encoding='utf-8') as result_data:
        for line in init_data:
            if seedword not in line and any(mid in line
                                            for mid in midkeywords[seedword]):
                result_data.write(line)

for seedword in seedwords:
    comkey_words_file(seedword)

def get_comkey_words(seedword):
    """For each mediator keyword of *seedword*, print the 3 most frequent
    co-occurring tokens with counts — the candidate competitive keywords."""
    # Read once; the original reopened the file for every mediator keyword.
    with open('./comkey_' + seedword, 'r', encoding='utf-8') as comkey_data:
        lines = comkey_data.readlines()
    for midkeyword in midkeywords[seedword]:
        comkeyquery_list = []
        for line in lines:
            if midkeyword in line:
                comkeyquery_list.extend(jieba.cut(line.rstrip('\n')))
        comkeyquery_list = stop_words_filter(comkeyquery_list)
        for key, val in Counter(comkeyquery_list).most_common(3):
            if key != midkeyword:
                print(key, val, end=' ')

for seedword in seedwords:
    print(seedword + ':\n')
    get_comkey_words(seedword)

# Competitive keywords were curated manually from the printout above; the
# |{ka}| counts below were likewise collected offline (8 per seed keyword).
ka_list = [[12907, 2078, 1495, 8434, 3095, 4896, 12021, 10427],
           [34432, 10393, 7775, 3805, 5295, 1592, 2824, 1021],
           [7135, 2128, 2852, 2050, 22595, 2227, 3792, 33049],
           [832, 17400, 17304, 31527, 11673, 1038, 1640, 29],
           [2059, 10909, 5397, 6621, 14470, 4107, 8198, 35225],
           [11313, 4737, 2172, 16375, 2050, 898, 3270, 1967],
           [876, 66592, 1011, 18240, 7890, 18840, 11613, 31519],
           [11794, 11417, 2717, 34264, 205, 317, 1723, 2724],
           [3351, 10414, 6072, 2133, 9202, 401, 3577, 2921],
           [34412, 11511, 31527, 967, 3883, 5253, 41, 6798]]
comp_list = [['做法', '男生', '句子', '价格表', '视频', '最新', '下载', '头像'],
             ['txt', '头像', '苹果7', '下载', '荣耀', '大全', '路由器', '高清'],
             ['穿越火线', '电影', '前十名', '耽美', '全文', '末世', '女主角', 'txt'],
             ['设计', '观看', '在线', '图片', '下载', '游戏', '大全', '教学'],
             ['重生', 'qq', '电视剧', '手机', '阅读', '手机游戏', '穿越', '百度云'],
             ['下载', '女生', '重生', '查询', '侠盗', '163邮箱', '背景音乐', '字'],
             ['中国', '官网', '视频', '在线', '下载', '观看', 'qq', '图片'],
             ['青云志', '贴吧', '人力资源', 'txt', '糗事', '视频', '360云盘', '中国地图'],
             ['语文', '字', '阅读', '英文单词', '大学生', '成绩', '英文翻译', '视频教程'],
             ['txt', 'qq', '图片', '单机版', '地方', '大全', 'exo', '笔记本电脑']]
compkeywords = dict(zip(seedwords, comp_list))

# Mediator keyword -> competitive keywords, per seed keyword. The
# comprehension replaces the original index loop whose variable shadowed the
# builtin `list` (zip truncates the 11 mediators to the 8 curated entries).
compwords = {seed: dict(zip(mids, comps))
             for seed, mids, comps in zip(seedwords, midkeywords_list, comp_list)}

# Mediator keyword -> |{ka}| query volume, per seed keyword.
ka_query_volume = {seed: dict(zip(mids, kas))
                   for seed, mids, kas in zip(seedwords, midkeywords_list, ka_list)}

# --- 5. Competition degree of competitive keywords ---

def count_midkeyword(seedword):
    """Return mediator -> number of ./words2.train lines containing it."""
    count_dict = dict.fromkeys(midkeywords[seedword], 0)
    with open('./words2.train', 'r', encoding='utf-8') as init_data:
        for sentence in init_data:
            for midkeyword in midkeywords[seedword]:
                if midkeyword in sentence:
                    count_dict[midkeyword] += 1
    return count_dict

# a(mediator): total query volume of each mediator keyword.
a_query_volume = {}
for seedword in seedwords:
    a_query_volume[seedword] = count_midkeyword(seedword)

def getcomp():
    """Return seed -> {mediator: ka / (a - sa)}. Like the original, raises
    ZeroDivisionError if a == sa for some mediator keyword."""
    comp_query_volume = copy.deepcopy(ka_query_volume)
    for seedword in ka_query_volume:
        for midkeyword in ka_query_volume[seedword]:
            comp_query_volume[seedword][midkeyword] = (
                ka_query_volume[seedword][midkeyword]
                / (a_query_volume[seedword][midkeyword]
                   - sa_query_volume[seedword][midkeyword]))
    return comp_query_volume

result_query_volume = getcomp()

def comp_result():
    """Return seed -> {competitive keyword: weight * competition degree}.

    NOTE(review): pairs the i-th mediator (dict insertion order) with the
    i-th curated competitive keyword — relies on that order matching comp_list.
    """
    result = {}
    for seedword in result_query_volume:
        result.setdefault(seedword, {})
        for i, midkeyword in enumerate(result_query_volume[seedword]):
            result[seedword][compkeywords[seedword][i]] = (
                w_midkeyword[seedword][midkeyword]
                * result_query_volume[seedword][midkeyword])
    return result

# Competition degree between each keyword k and its seed keyword s.
comp_k_s = comp_result()

# --- 6. Ranked output of competitive keywords ---

def comp_print(seedword):
    """Print the competitive keywords of *seedword* ranked by competition
    degree, descending."""
    resultpd = pd.DataFrame.from_dict(comp_k_s[seedword], orient='index',
                                      columns=['竞争度'])
    print(resultpd.sort_values(by='竞争度', ascending=False))

comp_print('图片')  # competitive keywords of 图片
comp_print('手机')
comp_print('小说')
comp_print('视频')
comp_print('下载')
comp_print('qq')
comp_print('电影')
comp_print('百度')
comp_print('英语')
comp_print('游戏')