
preprocess.py 1.5 kB

'''
Tokenize the Yelp dataset's reviews using Stanford CoreNLP.
'''
import json
import os
import pickle

import nltk
from nltk.tokenize import stanford

input_filename = 'review.json'

# Config for Stanford CoreNLP (paths are machine-specific).
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

in_dirname = 'review'    # raw (stars, text) chunks
out_dirname = 'reviews'  # tokenized (stars, tokens) chunks

# Step 1: split the JSON-lines review file into pickled chunks of 5000 samples.
f = open(input_filename, encoding='utf-8')
samples = []
j = 0
for i, line in enumerate(f.readlines()):
    review = json.loads(line)
    samples.append((review['stars'], review['text']))
    if (i + 1) % 5000 == 0:
        print(i)
        pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
        j += 1
        samples = []
# Dump the last, partially filled chunk.
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))

# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])

# Step 2: for every chunk, split each review into sentences with NLTK,
# tokenize each sentence with CoreNLP, and pickle the result.
for fn in os.listdir(in_dirname):
    print(fn)
    processed = []
    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        processed.append((stars, tokens))
        # print(tokens)
        if len(processed) % 100 == 0:
            print(len(processed))
    pickle.dump(processed, open(os.path.join(out_dirname, fn), 'wb'))
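
Each output pickle in the 'reviews' directory holds a list of (stars, tokens) pairs, where tokens is a list of sentences and each sentence is a list of word tokens. A minimal sketch of loading and inspecting one chunk (the file name samples0.pkl is simply the first chunk produced above):

import pickle

# Load the first tokenized chunk produced by preprocess.py.
# Each entry is (stars, tokens): stars is the review rating,
# tokens is a list of sentences, each a list of word tokens.
with open('reviews/samples0.pkl', 'rb') as f:
    processed = pickle.load(f)

stars, tokens = processed[0]
print(stars)            # e.g. 5.0
print(tokens[0][:10])   # first 10 tokens of the first sentence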

A lightweight natural language processing (NLP) toolkit that aims to reduce the amount of engineering code in user projects, such as data-processing loops, training loops, and multi-GPU execution.