Merge pull request !743 from guozhijian/add_perf_test_for_mindrecordtags/v0.3.0-alpha
| @@ -118,5 +118,8 @@ def mindrecord_dict_data(task_id): | |||
| image_file = open(file_name, "rb") | |||
| image_bytes = image_file.read() | |||
| image_file.close() | |||
| if not image_bytes: | |||
| print("The image file: {} is invalid.".format(file_name)) | |||
| continue | |||
| data["data"] = image_bytes | |||
| yield data | |||
| @@ -0,0 +1,32 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """use ImageNetToMR tool generate mindrecord""" | |||
| import os | |||
| from mindspore.mindrecord import ImageNetToMR | |||
| IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt" | |||
| IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images" | |||
| MINDRECORD_FILE = "./imagenet.mindrecord" | |||
| PARTITION_NUMBER = 16 | |||
| def imagenet_to_mindrecord(): | |||
| imagenet_transformer = ImageNetToMR(IMAGENET_MAP_FILE, | |||
| IMAGENET_IMAGE_DIR, | |||
| MINDRECORD_FILE, | |||
| PARTITION_NUMBER) | |||
| imagenet_transformer.transform() | |||
| if __name__ == '__main__': | |||
| imagenet_to_mindrecord() | |||
| @@ -0,0 +1,113 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """generate tfrecord""" | |||
| import collections | |||
| import os | |||
| import tensorflow as tf | |||
| IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt" | |||
| IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images" | |||
| TFRECORD_FILE = "./imagenet.tfrecord" | |||
| PARTITION_NUMBER = 16 | |||
| def get_imagenet_filename_label_pic(map_file, image_dir): | |||
| """ | |||
| Get data from imagenet. | |||
| Yields: | |||
| filename, label, image_bytes | |||
| """ | |||
| if not os.path.exists(map_file): | |||
| raise IOError("map file {} not exists".format(map_file)) | |||
| label_dict = {} | |||
| with open(map_file) as fp: | |||
| line = fp.readline() | |||
| while line: | |||
| labels = line.split(" ") | |||
| label_dict[labels[1]] = labels[0] | |||
| line = fp.readline() | |||
| # get all the dir which are n02087046, n02094114, n02109525 | |||
| dir_paths = {} | |||
| for item in label_dict: | |||
| real_path = os.path.join(image_dir, label_dict[item]) | |||
| if not os.path.isdir(real_path): | |||
| print("{} dir is not exist".format(real_path)) | |||
| continue | |||
| dir_paths[item] = real_path | |||
| if not dir_paths: | |||
| raise PathNotExistsError("not valid image dir in {}".format(image_dir)) | |||
| # get the filename, label and image binary as a dict | |||
| for label in dir_paths: | |||
| for item in os.listdir(dir_paths[label]): | |||
| file_name = os.path.join(dir_paths[label], item) | |||
| if not item.endswith("JPEG") and not item.endswith("jpg"): | |||
| print("{} file is not suffix with JPEG/jpg, skip it.".format(file_name)) | |||
| continue | |||
| # get the image data | |||
| image_file = open(file_name, "rb") | |||
| image_bytes = image_file.read() | |||
| image_file.close() | |||
| if not image_bytes: | |||
| print("The image file: {} is invalid.".format(file_name)) | |||
| continue | |||
| yield str(file_name), int(label), image_bytes | |||
| def create_int_feature(values): | |||
| feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[values])) | |||
| return feature | |||
| def create_string_feature(values): | |||
| feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(values, encoding='utf-8')])) | |||
| return feature | |||
| def create_bytes_feature(values): | |||
| feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[values])) | |||
| return feature | |||
| def imagenet_to_tfrecord(): | |||
| writers = [] | |||
| for i in range(PARTITION_NUMBER): | |||
| output_file = TFRECORD_FILE + str(i).rjust(2, '0') | |||
| writers.append(tf.io.TFRecordWriter(output_file)) | |||
| writer_index = 0 | |||
| total_written = 0 | |||
| for file_name, label, image_bytes in get_imagenet_filename_label_pic(IMAGENET_MAP_FILE, | |||
| IMAGENET_IMAGE_DIR): | |||
| features = collections.OrderedDict() | |||
| features["file_name"] = create_string_feature(file_name) | |||
| features["label"] = create_int_feature(label) | |||
| features["data"] = create_bytes_feature(image_bytes) | |||
| tf_example = tf.train.Example(features=tf.train.Features(feature=features)) | |||
| writers[writer_index].write(tf_example.SerializeToString()) | |||
| writer_index = (writer_index + 1) % len(writers) | |||
| total_written += 1 | |||
| for writer in writers: | |||
| writer.close() | |||
| print("Write {} total examples".format(total_written)) | |||
| if __name__ == '__main__': | |||
| imagenet_to_tfrecord() | |||
| @@ -0,0 +1,106 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """test dataset performance about mindspore.MindDataset, mindspore.TFRecordDataset, tf.data.TFRecordDataset""" | |||
| import time | |||
| import mindspore.dataset as ds | |||
| from mindspore.mindrecord import FileReader | |||
| import tensorflow as tf | |||
| print_step = 5000 | |||
| def print_log(count): | |||
| if count % print_step == 0: | |||
| print("Read {} rows ...".format(count)) | |||
| def use_filereader(mindrecord): | |||
| start = time.time() | |||
| columns_list = ["data", "label"] | |||
| reader = FileReader(file_name=mindrecord, | |||
| num_consumer=4, | |||
| columns=columns_list) | |||
| num_iter = 0 | |||
| for index, item in enumerate(reader.get_next()): | |||
| num_iter += 1 | |||
| print_log(num_iter) | |||
| end = time.time() | |||
| print("Read by FileReader - total rows: {}, cost time: {}s".format(num_iter, end - start)) | |||
| def use_minddataset(mindrecord): | |||
| start = time.time() | |||
| columns_list = ["data", "label"] | |||
| data_set = ds.MindDataset(dataset_file=mindrecord, | |||
| columns_list=columns_list, | |||
| num_parallel_workers=4) | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(): | |||
| num_iter += 1 | |||
| print_log(num_iter) | |||
| end = time.time() | |||
| print("Read by MindDataset - total rows: {}, cost time: {}s".format(num_iter, end - start)) | |||
| def use_tfrecorddataset(tfrecord): | |||
| start = time.time() | |||
| columns_list = ["data", "label"] | |||
| data_set = ds.TFRecordDataset(dataset_files=tfrecord, | |||
| columns_list=columns_list, | |||
| num_parallel_workers=4, | |||
| shuffle=ds.Shuffle.GLOBAL) | |||
| data_set = data_set.shuffle(10000) | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(): | |||
| num_iter += 1 | |||
| print_log(num_iter) | |||
| end = time.time() | |||
| print("Read by TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start)) | |||
| def use_tensorflow_tfrecorddataset(tfrecord): | |||
| start = time.time() | |||
| def _parse_record(example_photo): | |||
| features = { | |||
| 'file_name': tf.io.FixedLenFeature([], tf.string), | |||
| 'label': tf.io.FixedLenFeature([1], tf.int64), | |||
| 'data': tf.io.FixedLenFeature([], tf.string)} | |||
| parsed_features = tf.io.parse_single_example(example_photo, features=features) | |||
| return parsed_features | |||
| data_set = tf.data.TFRecordDataset(filenames=tfrecord, | |||
| buffer_size=100000, | |||
| num_parallel_reads=4) | |||
| data_set = data_set.map(_parse_record, num_parallel_calls=4) | |||
| num_iter = 0 | |||
| for item in data_set.__iter__(): | |||
| num_iter += 1 | |||
| print_log(num_iter) | |||
| end = time.time() | |||
| print("Read by TensorFlow TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start)) | |||
| if __name__ == '__main__': | |||
| # use MindDataset | |||
| mindrecord = './imagenet.mindrecord00' | |||
| use_minddataset(mindrecord) | |||
| # use TFRecordDataset | |||
| tfrecord = ['imagenet.tfrecord00', 'imagenet.tfrecord01', 'imagenet.tfrecord02', 'imagenet.tfrecord03', | |||
| 'imagenet.tfrecord04', 'imagenet.tfrecord05', 'imagenet.tfrecord06', 'imagenet.tfrecord07', | |||
| 'imagenet.tfrecord08', 'imagenet.tfrecord09', 'imagenet.tfrecord10', 'imagenet.tfrecord11', | |||
| 'imagenet.tfrecord12', 'imagenet.tfrecord13', 'imagenet.tfrecord14', 'imagenet.tfrecord15'] | |||
| use_tfrecorddataset(tfrecord) | |||
| # use TensorFlow TFRecordDataset | |||
| use_tensorflow_tfrecorddataset(tfrecord) | |||
| # use FileReader | |||
| # use_filereader(mindrecord) | |||
| @@ -0,0 +1,18 @@ | |||
| { | |||
| "datasetType": "TF", | |||
| "numRows": 930059, | |||
| "columns": { | |||
| "file_name": { | |||
| "type": "uint8", | |||
| "rank": 0 | |||
| }, | |||
| "label": { | |||
| "type": "int64", | |||
| "rank": 0 | |||
| }, | |||
| "data": { | |||
| "type": "uint8", | |||
| "rank": 0 | |||
| } | |||
| } | |||
| } | |||