|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308 |
- import jieba
- from collections import Counter
- import copy
- import pandas as pd
-
# 一. Mediator keywords
def file(seedword, filename):
    """Write every line of ./result.train containing *seedword* to *filename*.

    Name kept for caller compatibility (the script calls `file(...)` below).
    """
    # `with` guarantees both handles are closed even if writing fails;
    # the original left both files open on any exception.
    with open('./result.train', 'r', encoding='utf-8') as init_data, \
         open(filename, 'w', encoding='utf-8') as result_data:
        for line in init_data:
            if seedword in line:
                result_data.write(line)
-
# The ten seed keywords; one filtered corpus file is produced for each.
seedwords = ['图片', '手机', '小说', '视频', '下载', 'qq', '电影', '百度', '英语', '游戏']
for sw in seedwords:
    file(sw, './seedword_' + sw)
-
-
def seedwords_segmentation(seedword):
    """Tokenise ./seedword_<seedword> with jieba.

    Each query line is segmented and written tab-separated (one query
    per line) to ./seedword_segmentation_<seedword>.
    """
    # The original bound a local named `file`, shadowing the helper
    # function above; renamed, and both handles are now closed via `with`.
    src = './seedword_' + seedword
    dst = './seedword_segmentation_' + seedword
    with open(src, 'r', encoding='utf-8') as seedwords_data, \
         open(dst, 'w', encoding='utf-8') as segmentation_data:
        for line in seedwords_data:
            line = line[:-1]  # strip the trailing newline before segmenting
            segmentation_data.write('\t'.join(jieba.cut(line)) + '\n')
-
-
# Segment every seed-word corpus file produced above.
for sw in seedwords:
    seedwords_segmentation(sw)
-
# Chinese stop words (function words, punctuation) dropped before counting.
# ('什么' appears twice in the original literal; the set is identical.)
stop_word = set(['我', '都', '与', '你', '多少', '年', '月', '什么', '及',
                 '和', '上', '如何', '是', '吗', '有', '在', '可以', '什么', '用', '了', '说',
                 '大', '很', '版', '之', '为什么', '怎么办', '能', '玩', '级', '写', ',', ' ', '的', '怎么'])


def stop_words_filter(words):
    """Return *words* with every stop word removed, order preserved."""
    return [w for w in words if w not in stop_word]
-
def getwordlist(seedword):
    """Return all tokens of ./seedword_segmentation_<seedword> as one flat list.

    Each line of the file is one tab-separated, pre-segmented query; the
    trailing newline is stripped before splitting.
    """
    # `with` closes the handle (the original relied on an explicit close),
    # and the stray `;` after the initialiser is gone.
    wordlist = []
    with open('./seedword_segmentation_' + seedword, 'r', encoding='utf-8') as f:
        for line in f:
            wordlist.extend(line[:-1].split('\t'))
    return wordlist
-
-
def word_frequency_count(seedword):
    """Candidate mediator keywords for *seedword*.

    Counts token frequencies (stop words removed) and returns the 2nd
    through 12th most common tokens; the top hit is skipped — presumably
    the seed word itself, as the disabled cleanup below suggests.
    """
    counts = Counter(stop_words_filter(getwordlist(seedword)))
    # Slicing off the first entry reproduces the original flag-based skip.
    return [token for token, _ in counts.most_common(12)[1:]]
-
-
# Build the candidate mediator-keyword set for every seed keyword.
midkeywords_list = [word_frequency_count(sw) for sw in seedwords]

# print("midkeywords_list")
print(midkeywords_list)
-
# Manual cleanup step (disabled): remove each seed word from its own
# candidate-mediator list when it still shows up there.
#midkeywords_list[0].remove('图片')
#midkeywords_list[1].remove('手机')
#midkeywords_list[2].remove('小说')
#midkeywords_list[3].remove('视频')
#midkeywords_list[4].remove('下载')
#midkeywords_list[5].remove('qq')
#midkeywords_list[6].remove('电影')
#midkeywords_list[7].remove('百度')
#midkeywords_list[8].remove('英语')
#midkeywords_list[9].remove('游戏')
-
-
-
def seedword_query_volume(seedword):
    """Query volume s of *seedword*: the count of its single most frequent token.

    Returns None when the token list is empty (matching the original,
    whose loop body never executed in that case).
    """
    top = Counter(getwordlist(seedword)).most_common(1)
    return top[0][1] if top else None


# Query volume s for every seed keyword.
s_query_volume = {sw: seedword_query_volume(sw) for sw in seedwords}
-
-
-
# Map each of the 10 seed keywords to its mediator-keyword list.
midkeywords = dict(zip(seedwords, midkeywords_list))
midkeywords  # no-op notebook echo left over from the original export
-
-
# 二. Query-volume statistics

def query_volume_count(seedword):
    """Co-occurrence counts s_a for *seedword*.

    For each mediator keyword of *seedword*, counts the queries in
    ./seedword_<seedword> containing both the seed word and that
    mediator keyword; returns the counts in mediator-keyword order.
    """
    # Read the corpus once instead of reopening it per mediator keyword
    # (the original also closed only the handle of the last iteration).
    with open('./seedword_' + seedword, 'r', encoding='utf-8') as f:
        sentences = [line for line in f if seedword in line]
    return [sum(1 for s in sentences if midkeyword in s)
            for midkeyword in midkeywords[seedword]]
-
# Search volume s_a for every (seed, mediator) pair.
sa_query_volume = {}
for sw in seedwords:
    counts = query_volume_count(sw)
    sa_query_volume[sw] = dict(zip(midkeywords[sw], counts))
sa_query_volume  # no-op notebook echo
-
-
# 三. Mediator-keyword weights

def count_midkeyword_weight():
    """Weight of each mediator keyword: w = s_a / s per (seed, mediator) pair."""
    return {
        sw: {mk: sa_query_volume[sw][mk] / s_query_volume[sw]
             for mk in sa_query_volume[sw]}
        for sw in seedwords
    }
-
-
-
# Mediator-keyword weights, keyed by seed keyword.
w_midkeyword = dict(count_midkeyword_weight())
w_midkeyword  # no-op notebook echo
-
-
# 四. Competitive-keyword candidate set

def comkey_words_file(seedword):
    """Collect competitor queries for *seedword* into ./comkey_<seedword>.

    A competitor query is one in ./words2.train that does NOT contain
    the seed word but contains at least one of its mediator keywords.
    """
    # `with` replaces the explicit closes; `any()` replaces the
    # inner loop + break, writing each matching line exactly once.
    with open('./words2.train', 'r', encoding='utf-8') as init_data, \
         open('./comkey_' + seedword, 'w', encoding='utf-8') as result_data:
        for line in init_data:
            if seedword not in line and any(
                    mk in line for mk in midkeywords[seedword]):
                result_data.write(line)


for sw in seedwords:
    comkey_words_file(sw)
-
-
def get_comkey_words(seedword):
    """Print top competitive tokens per mediator keyword of *seedword*.

    For every mediator keyword, segments all competitor queries that
    contain it, drops stop words, and prints the 3 most frequent tokens
    with counts, skipping the mediator keyword itself.
    """
    # Read the competitor corpus once — the original reopened
    # ./comkey_<seedword> on every mediator-keyword iteration and
    # leaked all but the last handle.
    with open('./comkey_' + seedword, 'r', encoding='utf-8') as f:
        comkey_lines = f.readlines()
    for midkeyword in midkeywords[seedword]:
        tokens = []
        for line in comkey_lines:
            if midkeyword in line:
                tokens.extend(jieba.cut(line[:-1]))
        tokens = stop_words_filter(tokens)
        for key, val in Counter(tokens).most_common(3):
            if key != midkeyword:
                print(key, val, end=' ')
-
-
-
# Show the competitive-keyword candidates for every seed keyword.
for sw in seedwords:
    print(sw + ':\n')
    get_comkey_words(sw)
-
-
-
# Competitive-keyword screening results (hand-collected constants).
# ka_list[i][j] is |{k_a}| — the external query volume of the j-th
# competitive keyword chosen for the i-th seed keyword.
ka_list = [[12907, 2078, 1495, 8434, 3095, 4896, 12021, 10427],
           [34432, 10393, 7775, 3805, 5295, 1592, 2824, 1021],
           [7135, 2128, 2852, 2050, 22595, 2227, 3792, 33049],
           [832, 17400, 17304, 31527, 11673, 1038, 1640, 29],
           [2059, 10909, 5397, 6621, 14470, 4107, 8198, 35225],
           [11313, 4737, 2172, 16375, 2050, 898, 3270, 1967],
           [876, 66592, 1011, 18240, 7890, 18840, 11613, 31519],
           [11794, 11417, 2717, 34264, 205, 317, 1723, 2724],
           [3351, 10414, 6072, 2133, 9202, 401, 3577, 2921],
           [34412, 11511, 31527, 967, 3883, 5253, 41, 6798]]

# comp_list[i][j] is the j-th competitive keyword selected for the
# i-th seed keyword (same ordering as ka_list).
comp_list = [['做法', '男生', '句子', '价格表', '视频', '最新', '下载', '头像'],
             ['txt', '头像', '苹果7', '下载', '荣耀', '大全', '路由器', '高清'],
             ['穿越火线', '电影', '前十名', '耽美', '全文', '末世', '女主角', 'txt'],
             ['设计', '观看', '在线', '图片', '下载', '游戏', '大全', '教学'],
             ['重生', 'qq', '电视剧', '手机', '阅读', '手机游戏', '穿越', '百度云'],
             ['下载', '女生', '重生', '查询', '侠盗', '163邮箱', '背景音乐', '字'],
             ['中国', '官网', '视频', '在线', '下载', '观看', 'qq', '图片'],
             ['青云志', '贴吧', '人力资源', 'txt', '糗事', '视频', '360云盘', '中国地图'],
             ['语文', '字', '阅读', '英文单词', '大学生', '成绩', '英文翻译', '视频教程'],
             ['txt', 'qq', '图片', '单机版', '地方', '大全', 'exo', '笔记本电脑']]
-
-
-
# Competitive keywords, keyed by seed keyword.
compkeywords = dict(zip(seedwords, comp_list))
compkeywords  # no-op notebook echo

# Map mediator keyword -> its competitive keyword, per seed keyword.
# The original used `dict.fromkeys(seedwords, {})`, which shares ONE
# dict object between all keys, and shadowed the builtin `list` as its
# loop variable; every value was overwritten anyway, so the resulting
# mapping is unchanged.
compwords = {}
for sw, mids, comps in zip(seedwords, midkeywords_list, comp_list):
    compwords[sw] = dict(zip(mids, comps))
-
-
-
# Store the |{k_a}| values keyed seed keyword -> mediator keyword.
# (Avoids the shared-dict `dict.fromkeys(..., {})` pitfall and the
# builtin-`list` shadowing of the original; output is identical.)
ka_query_volume = {}
for sw, mids, kas in zip(seedwords, midkeywords_list, ka_list):
    ka_query_volume[sw] = dict(zip(mids, kas))
ka_query_volume  # no-op notebook echo
-
-
# 五. Competitiveness of the competitive keywords

def count_midkeyword(seedword):
    """Volume a: per mediator keyword of *seedword*, count the queries
    in ./words2.train that contain it."""
    # fromkeys with an int default is safe (ints are immutable);
    # `with` replaces the explicit close.
    count_dict = dict.fromkeys(midkeywords[seedword], 0)
    with open('./words2.train', 'r', encoding='utf-8') as init_data:
        for sentence in init_data:
            for midkeyword in midkeywords[seedword]:
                if midkeyword in sentence:
                    count_dict[midkeyword] += 1
    return count_dict


# Volume a for every (seed, mediator) pair.
a_query_volume = {}
for sw in seedwords:
    a_query_volume[sw] = count_midkeyword(sw)
-
-
-
def getcomp():
    """Competition degree comp = |{k_a}| / (a - s_a) for every pair.

    NOTE(review): divides by (a - s_a); raises ZeroDivisionError if the
    two volumes coincide for some pair — same as the original, confirm
    the input data rules this out.
    """
    # Builds fresh nested dicts instead of deep-copying and overwriting;
    # the resulting values are identical.
    comp_query_volume = {}
    for sw in ka_query_volume:
        comp_query_volume[sw] = {
            mk: ka_query_volume[sw][mk]
                / (a_query_volume[sw][mk] - sa_query_volume[sw][mk])
            for mk in ka_query_volume[sw]
        }
    return comp_query_volume


result_query_volume = getcomp()
result_query_volume  # no-op notebook echo
-
-
def comp_result():
    """Competitiveness of competitive keyword k vs seed s: w(m) * comp(m).

    NOTE(review): pairs the i-th mediator keyword (dict insertion order)
    with the i-th competitive keyword — this relies on Python's ordered
    dicts matching comp_list's ordering, as the original did.
    """
    result = {}
    for sw in result_query_volume:
        result[sw] = {}
        for i, mk in enumerate(result_query_volume[sw]):
            result[sw][compkeywords[sw][i]] = (
                w_midkeyword[sw][mk] * result_query_volume[sw][mk])
    return result


comp_k_s = comp_result()
# Competitiveness of keyword k with respect to seed keyword s.
comp_k_s  # no-op notebook echo
-
-
# 六. Print the competitive keywords sorted by competitiveness

def comp_print(seedword):
    """Print *seedword*'s competitive keywords, most competitive first."""
    table = pd.DataFrame.from_dict(
        comp_k_s[seedword], orient='index', columns=['竞争度'])
    print(table.sort_values(by='竞争度', ascending=False))


# One ranked table per seed keyword ('图片' first, as in the original).
for sw in ['图片', '手机', '小说', '视频', '下载', 'qq', '电影', '百度', '英语', '游戏']:
    comp_print(sw)
-
-
-
-
-
|