@@ -8,13 +8,14 @@ WideDeep model jointly trained wide linear models and deep neural network, which
 - Install [MindSpore](https://www.mindspore.cn/install/en).
-- Download the dataset and convert the dataset to mindrecord, command as follows:
+- Place the raw dataset under a certain path, such as ./recommendation_dataset/origin_data. If you use the [criteo dataset](https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz), please download the dataset and unzip it to ./recommendation_dataset/origin_data.
+- Convert the dataset to mindrecord, command as follows:
 ```
-python src/preprocess_data.py --dense_dim=13 --slot_dim=26 --threshold=100 --train_line_count=45840617 --skip_id_convert=0
+python src/preprocess_data.py --data_path=./recommendation_dataset --dense_dim=13 --slot_dim=26 --threshold=100 --train_line_count=45840617 --skip_id_convert=0
 ```
 Arguments:
-  * `--data_type` {criteo,synthetic}: Currently we support criteo dataset and synthetic dataset.(Default: ./criteo_data/).
   * `--data_path` : The path of the data file.
   * `--dense_dim` : The number of your continues fields.
   * `--slot_dim` : The number of your sparse fields, it can also be called category features.
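Because the automatic download block is removed from src/preprocess_data.py (see the deleted urllib/tarfile code in the hunks below), the archive now has to be fetched and unpacked by hand. A minimal sketch of one way to do that in Python, modelled on the removed helper; the destination directory is only an example and should match whatever is later passed as `--data_path`:

```python
import os
import tarfile
import urllib.request

# One way to fetch and unpack the Criteo archive manually, modelled on the
# download helper removed from src/preprocess_data.py. The target directory
# is only an example; it should match the --data_path used later.
origin_data_path = "./recommendation_dataset/origin_data/"
os.makedirs(origin_data_path, exist_ok=True)

url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
archive_path = os.path.join(origin_data_path, url.split("/")[-1])
urllib.request.urlretrieve(url, filename=archive_path)

# Unpack the raw text files next to the archive.
with tarfile.open(archive_path) as tar:
    tar.extractall(path=origin_data_path)
```

Downloading and extracting the archive with any other tool into `<data_path>/origin_data/` works just as well.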
@@ -17,8 +17,6 @@ import os
 import pickle
 import collections
 import argparse
-import urllib.request
-import tarfile
 import numpy as np
 from mindspore.mindrecord import FileWriter

@@ -140,7 +138,7 @@ def mkdir_path(file_path):
         os.makedirs(file_path)

-def statsdata(file_path, dict_output_path, criteo_stats_dict, dense_dim=13, slot_dim=26):
+def statsdata(file_path, dict_output_path, recommendation_dataset_stats_dict, dense_dim=13, slot_dim=26):
     """Preprocess data and save data"""
     with open(file_path, encoding="utf-8") as file_in:
         errorline_list = []

@@ -161,13 +159,13 @@ def statsdata(file_path, dict_output_path, criteo_stats_dict, dense_dim=13, slot
             assert len(values) == dense_dim, "values.size: {}".format(len(values))
             assert len(cats) == slot_dim, "cats.size: {}".format(len(cats))
-            criteo_stats_dict.stats_vals(values)
-            criteo_stats_dict.stats_cats(cats)
-    criteo_stats_dict.save_dict(dict_output_path)
+            recommendation_dataset_stats_dict.stats_vals(values)
+            recommendation_dataset_stats_dict.stats_cats(cats)
+    recommendation_dataset_stats_dict.save_dict(dict_output_path)

-def random_split_trans2mindrecord(input_file_path, output_file_path, criteo_stats_dict, part_rows=2000000,
-                                  line_per_sample=1000, train_line_count=None,
+def random_split_trans2mindrecord(input_file_path, output_file_path, recommendation_dataset_stats_dict,
+                                  part_rows=2000000, line_per_sample=1000, train_line_count=None,
                                   test_size=0.1, seed=2020, dense_dim=13, slot_dim=26):
     """Random split data and save mindrecord"""
     if train_line_count is None:

@@ -216,7 +214,7 @@ def random_split_trans2mindrecord(input_file_path, output_file_path, criteo_stat
             assert len(values) == dense_dim, "values.size: {}".format(len(values))
             assert len(cats) == slot_dim, "cats.size: {}".format(len(cats))
-            ids, wts = criteo_stats_dict.map_cat2id(values, cats)
+            ids, wts = recommendation_dataset_stats_dict.map_cat2id(values, cats)
             ids_list.extend(ids)
             wts_list.extend(wts)
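For context, `map_cat2id` turns the 13 dense values and 26 categorical fields of a sample into id and weight lists, which the script then batches and writes with `mindspore.mindrecord.FileWriter`. A rough, self-contained sketch of writing one such sample to MindRecord follows; the field names, shapes, and types are assumptions for illustration, not necessarily the schema preprocess_data.py actually uses:

```python
import numpy as np
from mindspore.mindrecord import FileWriter

# Hypothetical schema: per sample, a flat id vector, a weight vector of the
# same length, and a label. Field names and types are illustrative assumptions.
schema = {
    "feat_ids": {"type": "int32", "shape": [-1]},
    "feat_vals": {"type": "float32", "shape": [-1]},
    "label": {"type": "float32", "shape": [-1]},
}

writer = FileWriter(file_name="sample.mindrecord", shard_num=1)
writer.add_schema(schema, "toy recommendation schema")

# One fake sample with 39 fields (13 dense + 26 categorical), standing in for
# the ids/wts produced by map_cat2id in the real pipeline.
ids = np.arange(39, dtype=np.int32)
wts = np.ones(39, dtype=np.float32)
writer.write_raw_data([{"feat_ids": ids, "feat_vals": wts,
                        "label": np.array([0.0], dtype=np.float32)}])
writer.commit()
```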
@@ -261,10 +259,8 @@ def random_split_trans2mindrecord(input_file_path, output_file_path, criteo_stat

 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="criteo data")
-    parser.add_argument("--data_type", type=str, default='criteo', choices=['criteo', 'synthetic'],
-                        help='Currently we support criteo dataset and synthetic dataset')
-    parser.add_argument("--data_path", type=str, default="./criteo_data/", help='The path of the data file')
+    parser = argparse.ArgumentParser(description="Recommendation dataset")
+    parser.add_argument("--data_path", type=str, default="./recommendation_dataset/", help='The path of the data file')
     parser.add_argument("--dense_dim", type=int, default=13, help='The number of your continues fields')
     parser.add_argument("--slot_dim", type=int, default=26,
                         help='The number of your sparse fields, it can also be called catelogy features.')

@@ -277,19 +273,6 @@ if __name__ == '__main__':
     args, _ = parser.parse_known_args()
     data_path = args.data_path

-    if args.data_type == 'criteo':
-        download_data_path = data_path + "origin_data/"
-        mkdir_path(download_data_path)
-        url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
-        file_name = download_data_path + '/' + url.split('/')[-1]
-        urllib.request.urlretrieve(url, filename=file_name)
-        tar = tarfile.open(file_name)
-        names = tar.getnames()
-        for name in names:
-            tar.extract(name, path=download_data_path)
-        tar.close()
     target_field_size = args.dense_dim + args.slot_dim
     stats = StatsDict(field_size=target_field_size, dense_dim=args.dense_dim, slot_dim=args.slot_dim,
                       skip_id_convert=args.skip_id_convert)
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """
-Criteo data process
+Recommendation dataset process
 """
 import os

@@ -27,7 +27,7 @@ import pandas as pd
 TRAIN_LINE_COUNT = 45840617
 TEST_LINE_COUNT = 6042135

-class CriteoStatsDict():
+class RecommendationDatasetStatsDict():
     """create data dict"""
     def __init__(self):
         self.field_size = 39  # value_1-13; cat_1-26;

@@ -135,7 +135,7 @@ def mkdir_path(file_path):
         os.makedirs(file_path)

 #
-def statsdata(data_file_path, output_path, criteo_stats):
+def statsdata(data_file_path, output_path, recommendation_dataset_stats):
     """data status"""
     with open(data_file_path, encoding="utf-8") as file_in:
         errorline_list = []

@@ -157,9 +157,9 @@ def statsdata(data_file_path, output_path, criteo_stats):
             cats = items[14:]
             assert len(values) == 13, "value.size: {}".format(len(values))
             assert len(cats) == 26, "cat.size: {}".format(len(cats))
-            criteo_stats.stats_vals(values)
-            criteo_stats.stats_cats(cats)
-    criteo_stats.save_dict(output_path)
+            recommendation_dataset_stats.stats_vals(values)
+            recommendation_dataset_stats.stats_cats(cats)
+    recommendation_dataset_stats.save_dict(output_path)

 #

@@ -169,7 +169,8 @@ def add_write(file_path, wr_str):

 #
-def random_split_trans2h5(in_file_path, output_path, criteo_stats, part_rows=2000000, test_size=0.1, seed=2020):
+def random_split_trans2h5(in_file_path, output_path, recommendation_dataset_stats,
+                          part_rows=2000000, test_size=0.1, seed=2020):
     """random split trans2h5"""
     test_size = int(TRAIN_LINE_COUNT * test_size)
     # train_size = TRAIN_LINE_COUNT - test_size

@@ -207,7 +208,7 @@ def random_split_trans2h5(in_file_path, output_path, criteo_stats, part_rows=200
             cats = items[14:]
             assert len(values) == 13, "value.size: {}".format(len(values))
             assert len(cats) == 26, "cat.size: {}".format(len(cats))
-            ids, wts = criteo_stats.map_cat2id(values, cats)
+            ids, wts = recommendation_dataset_stats.map_cat2id(values, cats)
             if i not in test_indices_set:
                 train_feature_list.append(ids + wts)
                 train_label_list.append(label)
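random_split_trans2h5 is the HDF5 counterpart of the MindRecord writer above: it shuffles the input lines, maps each one through `map_cat2id`, and dumps feature/label parts to .h5 files with pandas. A rough sketch of how one such part could be written; the file names, HDF keys, and array shapes here are assumptions rather than what the script necessarily produces, and `to_hdf` needs the optional PyTables dependency:

```python
import numpy as np
import pandas as pd

# Stand-ins for one accumulated part: per row, 39 ids plus 39 weights
# (13 dense + 26 categorical fields), and one label per row.
# Shapes and file names are illustrative only.
train_features = np.random.rand(1000, 78).astype(np.float32)
train_labels = np.random.randint(0, 2, size=(1000, 1)).astype(np.float32)

# Write one feature part and one label part to HDF5 (requires PyTables).
pd.DataFrame(train_features).to_hdf("train_input_part_0.h5", key="fixed")
pd.DataFrame(train_labels).to_hdf("train_output_part_0.h5", key="fixed")
```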
@@ -253,16 +254,17 @@ if __name__ == "__main__":
                         help="The path to save dataset")
     args, _ = parser.parse_known_args()
     base_path = args.raw_data_path
-    criteo_stat = CriteoStatsDict()
+    recommendation_dataset_stat = RecommendationDatasetStatsDict()

     # step 1, stats the vocab and normalize value
     datafile_path = base_path + "train_small.txt"
     stats_out_path = base_path + "stats_dict/"
     mkdir_path(stats_out_path)
-    statsdata(datafile_path, stats_out_path, criteo_stat)
+    statsdata(datafile_path, stats_out_path, recommendation_dataset_stat)
     print("------" * 10)
-    criteo_stat.load_dict(dict_path=stats_out_path, prefix="")
-    criteo_stat.get_cat2id(threshold=100)
+    recommendation_dataset_stat.load_dict(dict_path=stats_out_path, prefix="")
+    recommendation_dataset_stat.get_cat2id(threshold=100)

     # step 2, transform data trans2h5; version 2: np.random.shuffle
     infile_path = base_path + "train_small.txt"
     mkdir_path(args.output_path)
-    random_split_trans2h5(infile_path, args.output_path, criteo_stat, part_rows=2000000, test_size=0.1, seed=2020)
+    random_split_trans2h5(infile_path, args.output_path, recommendation_dataset_stat,
+                          part_rows=2000000, test_size=0.1, seed=2020)
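After the two steps above run, a quick way to sanity-check the converted data is to load one output part back with pandas; the file name and HDF key below are hypothetical and depend on how random_split_trans2h5 names what it writes:

```python
import pandas as pd

# Load one hypothetical output part and inspect its shape; adjust the path
# and key to match the files the script actually produced.
part = pd.read_hdf("./h5_output/train_input_part_0.h5", key="fixed")
print(part.shape)
```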