You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

main_data_process.py 3.2 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. import re
  2. import jieba
  3. from collections import Counter
# 1. Preprocessing: the raw corpus and the intermediate files derived from it.
path1="./user_tag_query.10W.TRAIN"  # raw training data (GB18030-encoded, tab-separated)
path2="./words1.train"  # query fields extracted from path1 (written by pre1)
path3="./words2.train"  # path2 with URL-leading rows removed (written by pre2)
  8. def pre1(path):
  9. data = open(path1,'r',encoding='gb18030')
  10. output = open('./words1.train','w',encoding='utf-8')
  11. for line in data:
  12. a = line.split('\t')
  13. a = a[4:]
  14. output_line = '\n'.join(a)
  15. output.write(output_line + '\n')
  16. data.close()
  17. output.close()
  18. def pre2(path1,path2):
  19. train_data = open(path2,'r',encoding='utf-8')
  20. result_data = open(path3,'w',encoding='utf-8')
  21. for line in train_data:
  22. word_list = line.split('\t')
  23. pattern = re.compile(r'[:]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
  24. if pattern.match(word_list[0]):
  25. continue
  26. line_string = '\t'.join(word_list) + '\n'
  27. result_data.write(line_string)
  28. train_data.close()
  29. result_data.close()
# Run the two preprocessing steps defined above.
pre1(path1)
pre2(path2,path3)
# 2. Word segmentation: tokenize each preprocessed line with jieba and
# write the tokens tab-separated, one output line per input line.
path4='./words3.train'
train_data = open(path3,'r',encoding='utf-8')
result_data = open(path4,'w',encoding='utf-8')
for sentence in train_data:
    sentence = sentence[:-1]  # drop the trailing newline before segmenting
    word_seg = jieba.cut(sentence)
    line_string = "\t".join(word_seg) + '\n'
    result_data.write(line_string)
train_data.close()
result_data.close()
  43. #3.删除停用词
  44. cn_stopwords = "cn_stopwords.txt"
  45. cn_stopwords_dict = open(cn_stopwords, 'r')
  46. cn_stopwords_content = cn_stopwords_dict.read()
  47. cn_stopwords_list = cn_stopwords_content.splitlines()
  48. cn_stopwords_dict.close()
  49. train_data = open(path4,'r',encoding='utf-8')
  50. result_data = open('./word4.train','w',encoding='utf-8')
  51. def cn_stopwords_filter(word_list,stop_words_list):
  52. word_cleaned=[]
  53. stopwords_list = set(stop_words_list)
  54. for word in word_list:
  55. if word not in stop_words_list:
  56. word_cleaned.append(word)
  57. return word_cleaned
  58. for line in train_data:
  59. line = line[:-1]
  60. word_list = line.split('\t')
  61. word_list = cn_stopwords_filter(word_list,stop_words_list)
  62. if len(word_list) == 0:
  63. continue
  64. line_string = "\t".join(word_list) + '\n'
  65. result_data.write(line_string)
  66. train_data.close()
  67. result_data.close()
  68. #4.选取种子关键字
  69. def count_word(filename):
  70. wordlist = [];
  71. data_file = open(filename,'r',encoding='utf-8')
  72. for line in data_file:
  73. line = line[:-1]
  74. words = line.split('\t')
  75. wordlist.extend(words)
  76. data_file.close()
  77. return wordlist
# Print the 20 most frequent tokens as candidate seed keywords.
# NOTE(review): './stopwords_data.train' is never produced by this script —
# the stop-word-filtered output above is './word4.train'; confirm which
# file is actually intended here.
word_list = count_word('./stopwords_data.train')
count_result = Counter(word_list)
for key, val in count_result.most_common(20):
    print(key, val)
  82. train_data = open('./word4.train','r',encoding='utf-8')
  83. result_data = open('./result','w',encoding='utf-8')
  84. words_list=['小说','电脑','笔记本','王者','平板','手机']
  85. for line in train_data:
  86. flag = False
  87. for seedword in words_list:
  88. if seedword in line:
  89. flag = True
  90. break
  91. if(flag==True):
  92. result_data.write(line)
  93. train_data.close()
  94. result_data.close()

竞争性关键字推荐算法设计项目实践

Contributors (1)