'''
Tokenize the Yelp dataset's review documents using Stanford CoreNLP.
'''
import json
import os
import pickle

import nltk
from nltk.tokenize import stanford

input_filename = 'review.json'

# Config for Stanford CoreNLP. CoreNLPTokenizer is a client for a running
# CoreNLP server (http://localhost:9000 by default); JAVAHOME and the jar
# path below document where the local CoreNLP installation lives.
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

in_dirname = 'review'    # raw (stars, text) sample chunks
out_dirname = 'reviews'  # tokenized output chunks
os.makedirs(in_dirname, exist_ok=True)
os.makedirs(out_dirname, exist_ok=True)

# Step 1: split review.json into pickled chunks of 5000 (stars, text) pairs.
samples = []
j = 0
with open(input_filename, encoding='utf-8') as f:
    for i, line in enumerate(f):  # stream the file instead of readlines(); it is too large to hold in memory
        review = json.loads(line)
        samples.append((review['stars'], review['text']))
        if (i + 1) % 5000 == 0:
            print(i)
            with open(in_dirname + '/samples%d.pkl' % j, 'wb') as out:
                pickle.dump(samples, out)
            j += 1
            samples = []
# Flush the last, possibly partial, chunk.
with open(in_dirname + '/samples%d.pkl' % j, 'wb') as out:
    pickle.dump(samples, out)

# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])

# Step 2: sentence-split each review with NLTK (requires the 'punkt' model,
# i.e. nltk.download('punkt')), tokenize each sentence with CoreNLP, and
# pickle (stars, [[token, ...], ...]) pairs chunk by chunk.
for fn in os.listdir(in_dirname):
    print(fn)
    processed = []
    with open(os.path.join(in_dirname, fn), 'rb') as chunk:
        samples = pickle.load(chunk)
    for stars, text in samples:
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        processed.append((stars, tokens))
        # print(tokens)
        if len(processed) % 100 == 0:
            print(len(processed))
    with open(os.path.join(out_dirname, fn), 'wb') as out:
        pickle.dump(processed, out)
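
# A minimal sanity check (a sketch, not part of the original pipeline): load the
# first tokenized chunk and print one example. The chunk name 'samples0.pkl' is
# an assumption based on the out_dirname and chunk-naming scheme used above.
with open(os.path.join(out_dirname, 'samples0.pkl'), 'rb') as f_check:
    check = pickle.load(f_check)
stars, sentences = check[0]
print('stars:', stars)                       # star rating of the first review
print('first sentence tokens:', sentences[0])  # CoreNLP tokens of its first sentence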