You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

Second_code.py 9.7 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. import jieba
  2. from collections import Counter
  3. import copy
  4. import pandas as pd
  5. #一.中介关键词
  6. def file(seedword,filename):
  7. init_data = open('./result.train','r',encoding='utf-8')
  8. result_data = open(filename,'w',encoding='utf-8')
  9. for line in init_data:
  10. if seedword in line:
  11. result_data.write(line)
  12. init_data.close()
  13. result_data.close()
  14. seedwords=['图片','手机','小说','视频','下载','qq','电影','百度','英语','游戏']
  15. for seedword in seedwords:
  16. filename = './seedword_'+seedword
  17. file(seedword,filename)
  18. def seedwords_segmentation(seedword):
  19. file = './seedword_'+seedword
  20. segmentation_file = './seedword_segmentation_'+seedword
  21. seedwords_data = open(file,'r',encoding='utf-8')
  22. segmentation_data = open(segmentation_file,'w',encoding='utf-8')
  23. for x in seedwords_data:
  24. x = x[:-1]
  25. word_seg = jieba.cut(x)
  26. line_string = '\t'.join(word_seg)+'\n'
  27. segmentation_data.write(line_string)
  28. seedwords_data.close()
  29. segmentation_data.close()
  30. for seedword in seedwords:
  31. seedwords_segmentation(seedword)
  32. stop_word=set(['我','都','与','你','多少','年','月','什么','及',
  33. '和','上','如何','是','吗','有','在','可以','什么','用','了','说',
  34. '大','很','版','之','为什么','怎么办','能','玩','级','写',',',' ','的','怎么'])
  35. def stop_words_filter(words):
  36. word_cleaned = []
  37. for word in words:
  38. if word not in stop_word:
  39. word_cleaned.append(word)
  40. return word_cleaned
  41. def getwordlist(seedword):
  42. wordlist = [];
  43. segmentation_file = './seedword_segmentation_'+seedword
  44. seedwords_data = open(segmentation_file,'r',encoding='utf-8')
  45. for line in seedwords_data:
  46. line = line[:-1]
  47. words = line.split('\t')
  48. wordlist.extend(words)
  49. seedwords_data.close()
  50. return wordlist
  51. def word_frequency_count(seedword):
  52. word_list = getwordlist(seedword)
  53. word_list = stop_words_filter(word_list)
  54. count_result = Counter(word_list)
  55. midkeywords=[]
  56. flag = False
  57. for key, val in count_result.most_common(12):
  58. if flag==True:
  59. midkeywords.append(key)
  60. flag=True
  61. return midkeywords
  62. #得到种子关键词的候选中介关键词集合
  63. midkeywords_list = []
  64. for seedword in seedwords:
  65. midkeywords_list.append(word_frequency_count(seedword))
  66. #print("midkeywords_list")
  67. print(midkeywords_list)
  68. #midkeywords_list[0].remove('图片')
  69. #midkeywords_list[1].remove('手机')
  70. #midkeywords_list[2].remove('小说')
  71. #midkeywords_list[3].remove('视频')
  72. #midkeywords_list[4].remove('下载')
  73. #midkeywords_list[5].remove('qq')
  74. #midkeywords_list[6].remove('电影')
  75. #midkeywords_list[7].remove('百度')
  76. #midkeywords_list[8].remove('英语')
  77. #midkeywords_list[9].remove('游戏')
  78. def seedword_query_volume(seedword):
  79. word_list = getwordlist(seedword)
  80. count_result = Counter(word_list)
  81. for key, val in count_result.most_common(1):
  82. return val
  83. s_query_volume = {}
  84. for seedword in seedwords:
  85. s_query_volume[seedword] = seedword_query_volume(seedword)
  86. # 得到10个种子关键词对应中介关键词,存在midkeywords这个字典中
  87. midkeywords=dict(zip(seedwords,midkeywords_list))
  88. midkeywords
  89. #二. 搜索量统计
  90. def query_volume_count(seedword):
  91. count_list = []
  92. filename = './seedword_'+seedword
  93. for midkeyword in midkeywords[seedword]:
  94. count = 0
  95. init_data = open(filename,'r',encoding = 'utf-8')
  96. for setence in init_data:
  97. if seedword in setence and midkeyword in setence:
  98. count+=1
  99. count_list.append(count)
  100. init_data.close()
  101. return count_list
  102. # 统计得到的搜索量sa,存储在字典sa_query_volume中
  103. sa_query_volume = {}
  104. for seedword in seedwords:
  105. sa_count_list=[]
  106. sa_count_list = query_volume_count(seedword)
  107. sa_count_dict = dict(zip(midkeywords[seedword],sa_count_list))
  108. sa_query_volume[seedword]=sa_count_dict
  109. sa_query_volume
  110. #三. 确定中介关键字的权重
  111. def count_midkeyword_weight():
  112. midkeyword_weight={}
  113. for seedword in seedwords:
  114. s = s_query_volume[seedword]
  115. weight_dict = {}
  116. for midkeyword in sa_query_volume[seedword]:
  117. sa = sa_query_volume[seedword][midkeyword]
  118. weight = sa/s
  119. weight_dict[midkeyword] = weight
  120. midkeyword_weight[seedword] = weight_dict
  121. return midkeyword_weight
  122. # 将中介关键词的权重结果存储在字典w_midkeyword中
  123. w_midkeyword = dict(count_midkeyword_weight())
  124. w_midkeyword
  125. #四.竞争性关键字集合
  126. def comkey_words_file(seedword):
  127. init_data = open('./words2.train','r',encoding='utf-8')
  128. filename = './comkey_'+seedword
  129. result_data = open(filename,'w',encoding='utf-8')
  130. for line in init_data:
  131. if seedword not in line:
  132. for midkeyword in midkeywords[seedword]:
  133. if midkeyword in line:
  134. result_data.write(line)
  135. break
  136. init_data.close()
  137. result_data.close()
  138. for seedword in seedwords:
  139. comkey_words_file(seedword)
  140. def get_comkey_words(seedword):
  141. for midkeyword in midkeywords[seedword]:
  142. comkey_data = open('./comkey_'+seedword,'r',encoding='utf-8')
  143. comkeyquery_list = []
  144. for line in comkey_data:
  145. if midkeyword in line:
  146. line = line[:-1]
  147. word_seg = jieba.cut(line)
  148. comkeyquery_list.extend(word_seg)
  149. comkeyquery_list=stop_words_filter(comkeyquery_list)
  150. count_result = Counter(comkeyquery_list)
  151. for key, val in count_result.most_common(3):
  152. if key!=midkeyword:
  153. print(key,val,end=' ')
  154. comkey_data.close()
  155. for seedword in seedwords:
  156. print(seedword+':\n')
  157. get_comkey_words(seedword)
# Competitive-keyword screening results (gathered offline, hard-coded here).
# ka_list[i][j] = |{ka}|: the query volume of the j-th competitive keyword of
# the i-th seed keyword — presumably collected by hand from the corpus; TODO
# confirm against the data source.
ka_list=[[12907,2078,1495,8434,3095,4896,12021,10427],
[34432,10393,7775,3805,5295,1592,2824,1021],
[7135,2128,2852,2050,22595,2227,3792,33049],
[832,17400,17304,31527,11673,1038,1640,29],
[2059,10909,5397,6621,14470,4107,8198,35225],
[11313,4737,2172,16375,2050,898,3270,1967],
[876,66592,1011,18240,7890,18840,11613,31519],
[11794,11417,2717,34264,205,317,1723,2724],
[3351,10414,6072,2133,9202,401,3577,2921],
[34412,11511,31527,967,3883,5253,41,6798]]
# comp_list[i][j] = the selected competitive keyword paired with the j-th
# intermediary keyword of the i-th seed keyword (same row/column order as
# ka_list).
comp_list=[['做法','男生','句子','价格表','视频','最新','下载','头像'],
['txt','头像','苹果7','下载','荣耀','大全','路由器','高清'],
['穿越火线','电影','前十名','耽美','全文','末世','女主角','txt'],
['设计','观看','在线','图片','下载','游戏','大全','教学'],
['重生','qq','电视剧','手机','阅读','手机游戏','穿越','百度云'],
['下载','女生','重生','查询','侠盗','163邮箱','背景音乐','字'],
['中国','官网','视频','在线','下载','观看','qq','图片'],
['青云志','贴吧','人力资源','txt','糗事','视频','360云盘','中国地图'],
['语文','字','阅读','英文单词','大学生','成绩','英文翻译','视频教程'],
['txt','qq','图片','单机版','地方','大全','exo','笔记本电脑']]
# Map each seed keyword to its list of competitive keywords.
compkeywords=dict(zip(seedwords,comp_list))
# Notebook-style echo; no effect when run as a script.
compkeywords
  182. # 每个中介关键词对应的竞争性关键词
  183. compwords=dict.fromkeys(seedwords,{})
  184. i=0
  185. for list in midkeywords_list:
  186. compwords[seedwords[i]]=dict(zip(list,comp_list[i]))
  187. i+=1
  188. # 将|{ka}|的值存在字典ka_query_list中
  189. ka_query_volume=dict.fromkeys(seedwords,{})
  190. i=0
  191. for list in midkeywords_list:
  192. ka_query_volume[seedwords[i]]=dict(zip(list,ka_list[i]))
  193. i+=1
  194. ka_query_volume
  195. #五.计算竞争性关键字的竞争度
  196. def count_midkeyword(seedword):
  197. count_dict=dict.fromkeys(midkeywords[seedword],0)
  198. init_data = open('./words2.train','r',encoding = 'utf-8')
  199. for sentence in init_data:
  200. for midkeyword in midkeywords[seedword]:
  201. if midkeyword in sentence:
  202. count_dict[midkeyword]+=1
  203. init_data.close()
  204. return count_dict
  205. a_query_volume={}
  206. for seedword in seedwords:
  207. a_query_volume[seedword]=count_midkeyword(seedword)
  208. def getcomp():
  209. comp_query_volume = copy.deepcopy(ka_query_volume)
  210. for seedword in ka_query_volume:
  211. for midkeyword in ka_query_volume[seedword]:
  212. comp_query_volume[seedword][midkeyword] = ka_query_volume[seedword][midkeyword]/(a_query_volume[seedword][midkeyword]-sa_query_volume[seedword][midkeyword])
  213. return comp_query_volume
  214. result_query_volume=getcomp()
  215. result_query_volume
  216. def comp_result():
  217. result={}
  218. for seedword in result_query_volume:
  219. result.setdefault(seedword,{})
  220. i=0
  221. for midkeyword in result_query_volume[seedword]:
  222. result[seedword][compkeywords[seedword][i]]=w_midkeyword[seedword][midkeyword]*result_query_volume[seedword][midkeyword]
  223. i+=1
  224. return result
  225. comp_k_s=comp_result()
  226. #关键词k与种子关键词s的竞争性程度
  227. comp_k_s
  228. #六.竞争性关键字排序输出
  229. def comp_print(seedword):
  230. resultpd= pd.DataFrame.from_dict(comp_k_s[seedword],orient='index',columns=['竞争度'])
  231. print(resultpd.sort_values(by='竞争度',ascending=False))
  232. comp_print('图片')#图片的竞争性关键字
  233. comp_print('手机')
  234. comp_print('小说')
  235. comp_print('视频')
  236. comp_print('下载')
  237. comp_print('qq')
  238. comp_print('电影')
  239. comp_print('百度')
  240. comp_print('英语')
  241. comp_print('游戏')

竞争性关键字推荐算法设计项目实践

Contributors (1)