# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Dataset API: preprocess the aclImdb data, collect embedding weights, and build MindRecord datasets."""
import os
from itertools import chain
import gensim
import numpy as np

from mindspore.mindrecord import FileWriter
import mindspore.dataset as ds


# preprocess part
def encode_samples(tokenized_samples, word_to_idx):
    """Encode tokens as vocabulary indices; unknown tokens map to index 0 (<unk>)."""
    features = []
    for sample in tokenized_samples:
        # Out-of-vocabulary tokens fall back to index 0, which is reserved for <unk>.
        feature = [word_to_idx.get(token, 0) for token in sample]
        features.append(feature)
    return features


def pad_samples(features, maxlen=50, pad=0):
    """Truncate or pad every feature list to exactly `maxlen` entries."""
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_feature = feature[:maxlen]
        else:
            # Pad with the `pad` index without mutating the caller's list.
            padded_feature = feature + [pad] * (maxlen - len(feature))
        padded_features.append(padded_feature)
    return padded_features
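
# Illustrative example of how the two helpers compose (hypothetical toy
# vocabulary, not part of the original pipeline):
#   word_to_idx = {'good': 1, 'movie': 2}
#   encode_samples([['good', 'movie', 'awful']], word_to_idx)  ->  [[1, 2, 0]]
#   pad_samples([[1, 2, 0]], maxlen=5)                         ->  [[1, 2, 0, 0, 0]]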


def read_imdb(path, seg='train'):
    """Read the IMDB data; each label split is a single file with one review per line."""
    pos_or_neg = ['pos', 'neg']
    data = []
    for label in pos_or_neg:
        file_path = os.path.join(path, seg, label)
        with open(file_path, 'r', encoding='utf-8') as rf:
            for line in rf:
                line = line.strip()
                if label == 'pos':
                    data.append([line, 1])
                elif label == 'neg':
                    data.append([line, 0])
    return data


def tokenizer(text):
    """Split text on single spaces and lowercase every token."""
    return [tok.lower() for tok in text.split(' ')]


def collect_weight(glove_path, vocab, word_to_idx, embed_size):
    """Collect pretrained embedding weights for every word in the vocabulary."""
    vocab_size = len(vocab)
    # wvmodel = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(glove_path, 'glove.6B.300d.txt'),
    #                                                           binary=False, encoding='utf-8')
    # Note: gensim >= 4.0 renamed `index2word` to `index_to_key`.
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(
        os.path.join(glove_path, 'GoogleNews-vectors-negative300.bin'), binary=True)
    weight_np = np.zeros((vocab_size + 1, embed_size)).astype(np.float32)

    # Row 0 is reserved for <unk>; words missing from the pretrained model stay all-zero.
    for word in wvmodel.index2word:
        try:
            index = word_to_idx[word]
        except KeyError:
            continue
        weight_np[index, :] = wvmodel.get_vector(word)
    return weight_np


def preprocess(data_path, glove_path, embed_size):
    """Preprocess the train and test data."""
    train_data = read_imdb(data_path, 'train')
    test_data = read_imdb(data_path, 'test')

    train_tokenized = []
    test_tokenized = []
    for review, _ in train_data:
        train_tokenized.append(tokenizer(review))
    for review, _ in test_data:
        test_tokenized.append(tokenizer(review))

    vocab = set(chain(*train_tokenized))
    vocab_size = len(vocab)
    print("vocab_size: ", vocab_size)

    word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
    word_to_idx['<unk>'] = 0

    train_features = np.array(pad_samples(encode_samples(train_tokenized, word_to_idx))).astype(np.int32)
    train_labels = np.array([score for _, score in train_data]).astype(np.int32)
    test_features = np.array(pad_samples(encode_samples(test_tokenized, word_to_idx))).astype(np.int32)
    test_labels = np.array([score for _, score in test_data]).astype(np.int32)

    weight_np = collect_weight(glove_path, vocab, word_to_idx, embed_size)
    return train_features, train_labels, test_features, test_labels, weight_np, vocab_size
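
# Note on shapes (derived from the code above, assuming the pad_samples default
# maxlen=50): train_features/test_features are (num_reviews, 50) int32 arrays,
# the label arrays are 1-D int32, and weight_np is (vocab_size + 1, embed_size) float32.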


def get_imdb_data(labels_data, features_data):
    """Pack (id, label, feature) records into the layout expected by FileWriter."""
    data_list = []
    for i, (label, feature) in enumerate(zip(labels_data, features_data)):
        data_json = {"id": i,
                     "label": int(label),
                     "feature": feature.reshape(-1)}
        data_list.append(data_json)
    return data_list
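
# Illustrative record produced above (the index values are made up):
#   {"id": 0, "label": 1, "feature": np.array([12, 873, 4, ..., 0], dtype=np.int32)}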


def convert_to_mindrecord(embed_size, data_path, preprocess_path, glove_path):
    """Convert the IMDB dataset to MindRecord format."""
    num_shard = 4
    train_features, train_labels, test_features, test_labels, weight_np, _ = \
        preprocess(data_path, glove_path, embed_size)
    np.savetxt(os.path.join(preprocess_path, 'weight.txt'), weight_np)

    print("train_features.shape:", train_features.shape,
          "train_labels.shape:", train_labels.shape,
          "weight_np.shape:", weight_np.shape,
          "type:", train_labels.dtype)

    # write mindrecord
    schema_json = {"id": {"type": "int32"},
                   "label": {"type": "int32"},
                   "feature": {"type": "int32", "shape": [-1]}}

    writer = FileWriter(os.path.join(preprocess_path, 'aclImdb_train.mindrecord'), num_shard)
    data = get_imdb_data(train_labels, train_features)
    writer.add_schema(schema_json, "nlp_schema")
    writer.add_index(["id", "label"])
    writer.write_raw_data(data)
    writer.commit()

    writer = FileWriter(os.path.join(preprocess_path, 'aclImdb_test.mindrecord'), num_shard)
    data = get_imdb_data(test_labels, test_features)
    writer.add_schema(schema_json, "nlp_schema")
    writer.add_index(["id", "label"])
    writer.write_raw_data(data)
    writer.commit()
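
# weight.txt written above stores the 2-D embedding matrix row by row; a training
# script would typically restore it with something like
#   np.loadtxt(os.path.join(preprocess_path, 'weight.txt'), dtype=np.float32)
# (that consumption step is an assumption about downstream code, not part of this file).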


def create_dataset(base_path, batch_size, num_epochs, is_train):
    """Create the train or test dataset from the converted MindRecord files."""
    columns_list = ["feature", "label"]
    num_consumer = 4

    if is_train:
        path = os.path.join(base_path, 'aclImdb_train.mindrecord0')
    else:
        path = os.path.join(base_path, 'aclImdb_test.mindrecord0')

    # num_epochs is unused here; epoch repetition is left to the caller.
    data_set = ds.MindDataset(path, columns_list, num_consumer)
    ds.config.set_seed(1)
    data_set = data_set.shuffle(buffer_size=data_set.get_dataset_size())
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set
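

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the paths and batch
    # size below are hypothetical and must point to a local aclImdb copy, the
    # pretrained word2vec binary, and a writable output directory.
    ACLIMDB_PATH = "./aclImdb"
    GLOVE_PATH = "./glove"
    PREPROCESS_PATH = "./preprocess"
    EMBED_SIZE = 300

    convert_to_mindrecord(EMBED_SIZE, ACLIMDB_PATH, PREPROCESS_PATH, GLOVE_PATH)
    train_set = create_dataset(PREPROCESS_PATH, batch_size=64, num_epochs=1, is_train=True)
    print("number of train batches:", train_set.get_dataset_size())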