
dataset.py 6.6 kB

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""dataset api"""
import os
from itertools import chain

import gensim
import numpy as np

from mindspore.mindrecord import FileWriter
import mindspore.dataset as ds


# preprocess part
def encode_samples(tokenized_samples, word_to_idx):
    """ encode word to index """
    features = []
    for sample in tokenized_samples:
        feature = []
        for token in sample:
            if token in word_to_idx:
                feature.append(word_to_idx[token])
            else:
                feature.append(0)
        features.append(feature)
    return features


def pad_samples(features, maxlen=50, pad=0):
    """ pad all features to the same length """
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_feature = feature[:maxlen]
        else:
            padded_feature = feature
            while len(padded_feature) < maxlen:
                padded_feature.append(pad)
        padded_features.append(padded_feature)
    return padded_features


def read_imdb(path, seg='train'):
    """ read imdb dataset """
    pos_or_neg = ['pos', 'neg']
    data = []
    for label in pos_or_neg:
        f = os.path.join(path, seg, label)
        # one review per line; label 1 = positive, 0 = negative
        with open(f, 'r') as rf:
            for line in rf:
                line = line.strip()
                if label == 'pos':
                    data.append([line, 1])
                elif label == 'neg':
                    data.append([line, 0])
    return data


def tokenizer(text):
    """ split a review into lower-cased whitespace tokens """
    return [tok.lower() for tok in text.split(' ')]


def collect_weight(glove_path, vocab, word_to_idx, embed_size):
    """ collect weight """
    vocab_size = len(vocab)
    # wvmodel = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(glove_path, 'glove.6B.300d.txt'),
    #                                                           binary=False, encoding='utf-8')
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(
        os.path.join(glove_path, 'GoogleNews-vectors-negative300.bin'), binary=True)
    weight_np = np.zeros((vocab_size + 1, embed_size)).astype(np.float32)
    idx_to_word = {i + 1: word for i, word in enumerate(vocab)}
    idx_to_word[0] = '<unk>'
    for i in range(len(wvmodel.index2word)):
        try:
            index = word_to_idx[wvmodel.index2word[i]]
        except KeyError:
            continue
        weight_np[index, :] = wvmodel.get_vector(
            idx_to_word[word_to_idx[wvmodel.index2word[i]]])
    return weight_np


def preprocess(data_path, glove_path, embed_size):
    """ preprocess the train and test data """
    train_data = read_imdb(data_path, 'train')
    test_data = read_imdb(data_path, 'test')
    train_tokenized = []
    test_tokenized = []
    for review, _ in train_data:
        train_tokenized.append(tokenizer(review))
    for review, _ in test_data:
        test_tokenized.append(tokenizer(review))
    vocab = set(chain(*train_tokenized))
    vocab_size = len(vocab)
    print("vocab_size: ", vocab_size)
    word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
    word_to_idx['<unk>'] = 0
    train_features = np.array(pad_samples(encode_samples(train_tokenized, word_to_idx))).astype(np.int32)
    train_labels = np.array([score for _, score in train_data]).astype(np.int32)
    test_features = np.array(pad_samples(encode_samples(test_tokenized, word_to_idx))).astype(np.int32)
    test_labels = np.array([score for _, score in test_data]).astype(np.int32)
    weight_np = collect_weight(glove_path, vocab, word_to_idx, embed_size)
    return train_features, train_labels, test_features, test_labels, weight_np, vocab_size


def get_imdb_data(labels_data, features_data):
    """ pack labels and features into the record layout expected by FileWriter """
    data_list = []
    for i, (label, feature) in enumerate(zip(labels_data, features_data)):
        data_json = {"id": i,
                     "label": int(label),
                     "feature": feature.reshape(-1)}
        data_list.append(data_json)
    return data_list


def convert_to_mindrecord(embed_size, data_path, preprocess_path, glove_path):
    """ convert imdb dataset to mindrecord """
    num_shard = 4
    train_features, train_labels, test_features, test_labels, weight_np, _ = \
        preprocess(data_path, glove_path, embed_size)
    np.savetxt(os.path.join(preprocess_path, 'weight.txt'), weight_np)
    print("train_features.shape:", train_features.shape, "train_labels.shape:", train_labels.shape,
          "weight_np.shape:", weight_np.shape, "type:", train_labels.dtype)
    # write mindrecord
    schema_json = {"id": {"type": "int32"},
                   "label": {"type": "int32"},
                   "feature": {"type": "int32", "shape": [-1]}}

    writer = FileWriter(os.path.join(preprocess_path, 'aclImdb_train.mindrecord'), num_shard)
    data = get_imdb_data(train_labels, train_features)
    writer.add_schema(schema_json, "nlp_schema")
    writer.add_index(["id", "label"])
    writer.write_raw_data(data)
    writer.commit()

    writer = FileWriter(os.path.join(preprocess_path, 'aclImdb_test.mindrecord'), num_shard)
    data = get_imdb_data(test_labels, test_features)
    writer.add_schema(schema_json, "nlp_schema")
    writer.add_index(["id", "label"])
    writer.write_raw_data(data)
    writer.commit()


def create_dataset(base_path, batch_size, num_epochs, is_train):
    """Create the train or test dataset from the MindRecord shards."""
    columns_list = ["feature", "label"]
    num_consumer = 4
    if is_train:
        path = os.path.join(base_path, 'aclImdb_train.mindrecord0')
    else:
        path = os.path.join(base_path, 'aclImdb_test.mindrecord0')
    data_set = ds.MindDataset(path, columns_list, num_consumer)
    ds.config.set_seed(1)
    data_set = data_set.shuffle(buffer_size=data_set.get_dataset_size())
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set
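
For context, a minimal sketch of how these functions are typically chained is shown below. The directory layout (./aclImdb, ./preprocess, ./word2vec) and the batch size are assumptions chosen for illustration, not values defined in this file.

# Usage sketch (assumed paths and hyperparameters; adjust to your setup).
from dataset import convert_to_mindrecord, create_dataset

if __name__ == "__main__":
    # One-off conversion: tokenize the raw IMDb text, look up word2vec vectors,
    # then write weight.txt plus the sharded MindRecord files.
    convert_to_mindrecord(embed_size=300,                  # matches the 300-d word2vec binary
                          data_path='./aclImdb',           # assumed raw dataset root
                          preprocess_path='./preprocess',  # assumed output directory
                          glove_path='./word2vec')         # assumed dir holding GoogleNews-vectors-negative300.bin
    # Build a shuffled, batched MindSpore dataset from the generated train shards.
    train_ds = create_dataset('./preprocess', batch_size=64, num_epochs=1, is_train=True)
    for batch in train_ds.create_dict_iterator():
        features, labels = batch["feature"], batch["label"]
        break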