From bf87b1d1fb4ee0eda62a0f3b65ee934239397402 Mon Sep 17 00:00:00 2001 From: qianlong Date: Sat, 25 Jul 2020 18:05:53 +0800 Subject: [PATCH 1/3] fix cache op core dump --- .../dataset/engine/datasetops/dataset_op.cc | 3 ++ tests/ut/python/dataset/test_cache_map.py | 39 ++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc index dd53e0527d..0e0db663d0 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc @@ -392,6 +392,9 @@ uint32_t DatasetOp::GenerateCRC(const std::shared_ptr &op) { ss_str = std::regex_replace(ss_str, std::regex("Num workers.*\n"), ""); ss_str = std::regex_replace(ss_str, std::regex("\\[workers.*\\]"), ""); + // Filter out Number of rows when generating the check sum + ss_str = std::regex_replace(ss_str, std::regex("Number of rows.*\n"), ""); + // Filter out the Operator control flags field when generating the check sum ss_str = std::regex_replace(ss_str, std::regex("Operator control flags.*\n"), ""); diff --git a/tests/ut/python/dataset/test_cache_map.py b/tests/ut/python/dataset/test_cache_map.py index 5de969db6d..72f33be576 100644 --- a/tests/ut/python/dataset/test_cache_map.py +++ b/tests/ut/python/dataset/test_cache_map.py @@ -104,11 +104,11 @@ def test_cache_map_basic3(): decode_op = c_vision.Decode() ds1 = ds1.repeat(4) ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache) - print("ds1.dataset_size is ", ds1.get_dataset_size()) + logger.info("ds1.dataset_size is ", ds1.get_dataset_size()) num_iter = 0 for _ in ds1.create_dict_iterator(): - print("get data from dataset") + logger.info("get data from dataset") num_iter += 1 logger.info("Number of data in ds1: {} ".format(num_iter)) @@ -116,6 +116,31 @@ def test_cache_map_basic3(): logger.info('test_cache_basic3 
Ended.\n')
 
 
+def test_cache_map_basic4():
+    """
+    Test different rows result in core dump
+    """
+    logger.info("Test cache basic 4")
+    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
+
+    # This DATA_DIR only has 2 images in it
+    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
+    decode_op = c_vision.Decode()
+    ds1 = ds1.repeat(4)
+    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
+    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
+    shape = ds1.output_shapes()
+
+    num_iter = 0
+    for _ in ds1.create_dict_iterator():
+        logger.info("get data from dataset")
+        num_iter += 1
+
+    logger.info("Number of data in ds1: {} ".format(num_iter))
+    assert num_iter == 8
+    logger.info('test_cache_basic3 Ended.\n')
+
+
 def test_cache_map_failure1():
     """
     Test nested cache (failure)
@@ -154,10 +179,12 @@
 
 if __name__ == '__main__':
     test_cache_map_basic1()
-    print("test_cache_map_basic1 success.")
+    logger.info("test_cache_map_basic1 success.")
     test_cache_map_basic2()
-    print("test_cache_map_basic2 success.")
+    logger.info("test_cache_map_basic2 success.")
     test_cache_map_basic3()
-    print("test_cache_map_basic3 success.")
+    logger.info("test_cache_map_basic3 success.")
+    test_cache_map_basic4()
+    logger.info("test_cache_map_basic4 success.")
     test_cache_map_failure1()
-    print("test_cache_map_failure1 success.")
+    logger.info("test_cache_map_failure1 success.")

From b6529b909664e8f4d0570e6c6c3c893977179711 Mon Sep 17 00:00:00 2001
From: guansongsong
Date: Tue, 28 Jul 2020 13:03:06 +0800
Subject: [PATCH 2/3] fix cache description

---
 mindspore/dataset/engine/datasets.py      | 16 +++++++++++-----
 tests/ut/python/dataset/test_cache_map.py |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index 3319c7d53f..f2eab10d66 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ 
-435,7 +435,8 @@ class Dataset: parallel (default=None, the value from the config will be used). python_multiprocessing (bool, optional): Parallelize python operations with multiple worker process. This option could be beneficial if the python operation is computational heavy (default=False). - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. Returns: MapDataset, dataset after mapping operation. @@ -1951,7 +1952,9 @@ class MapDataset(DatasetOp): in parallel (default=None). python_multiprocessing (bool, optional): Parallelize python operations with multiple worker process. This option could be beneficial if the python operation is computational heavy (default=False). - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. + Raises: ValueError: If len(input_columns) != len(output_columns) and columns_order is not specified. @@ -2552,7 +2555,8 @@ class ImageFolderDatasetV2(MappableDataset): into (default=None). shard_id (int, optional): The shard ID within num_shards (default=None). This argument should be specified only when num_shards is also specified. - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. Raises: RuntimeError: If sampler and shuffle are specified at the same time. @@ -3348,7 +3352,8 @@ class TFRecordDataset(SourceDataset): argument should be specified only when num_shards is also specified. 
shard_equal_rows (bool): Get equal rows for all shards(default=False). If shard_equal_rows is false, number of rows of each shard may be not equal. - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. Examples: >>> import mindspore.dataset as ds >>> import mindspore.common.dtype as mstype @@ -3919,7 +3924,8 @@ class RandomDataset(SourceDataset): num_samples (int): number of samples to draw from the total. (default=None, which means all rows) num_parallel_workers (int, optional): number of workers to read the data (default=None, number set in the config). - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. shuffle (bool, optional): Whether or not to perform shuffle on the dataset (default=None, expected order behavior shown in the table). 
num_shards (int, optional): Number of shards that the dataset should be divided diff --git a/tests/ut/python/dataset/test_cache_map.py b/tests/ut/python/dataset/test_cache_map.py index 72f33be576..4f8af60fce 100644 --- a/tests/ut/python/dataset/test_cache_map.py +++ b/tests/ut/python/dataset/test_cache_map.py @@ -130,7 +130,7 @@ def test_cache_map_basic4(): ds1 = ds1.map(input_columns=["image"], operations=decode_op) logger.info("ds1.dataset_size is ", ds1.get_dataset_size()) shape = ds1.output_shapes() - + logger.info(shape) num_iter = 0 for _ in ds1.create_dict_iterator(): logger.info("get data from dataset") From 1906ed1be8d9189cdc33678c2a6eb2419ead600b Mon Sep 17 00:00:00 2001 From: guansongsong Date: Tue, 28 Jul 2020 13:26:29 +0800 Subject: [PATCH 3/3] fix cache description again --- mindspore/dataset/engine/datasets.py | 3 ++ tests/ut/python/dataset/test_cache_map.py | 46 ++++++++++++----------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index f2eab10d66..93c5f7743c 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -2144,6 +2144,7 @@ class RepeatDataset(DatasetOp): """ return self.count + class SkipDataset(DatasetOp): """ The result of applying Skip operator to the input Dataset. @@ -2409,6 +2410,7 @@ class TransferDataset(DatasetOp): def stop_send(self): self.iterator.depipeline.StopSend() + class RangeDataset(MappableDataset): """ A source dataset that reads and parses datasets stored on disk in a range. @@ -5319,6 +5321,7 @@ class BuildVocabDataset(DatasetOp): return new_op + class BuildSentencePieceVocabDataset(DatasetOp): """ Build a SentencePieceVocab from a dataset. 
diff --git a/tests/ut/python/dataset/test_cache_map.py b/tests/ut/python/dataset/test_cache_map.py
index 4f8af60fce..154a4208a0 100644
--- a/tests/ut/python/dataset/test_cache_map.py
+++ b/tests/ut/python/dataset/test_cache_map.py
@@ -24,6 +24,7 @@
 DATA_DIR = "../data/dataset/testImageNetData/train/"
 
 GENERATE_GOLDEN = False
 
+
 def test_cache_map_basic1():
     """
     Test mappable leaf with cache op right over the leaf
@@ -117,28 +118,28 @@ def test_cache_map_basic4():
-    """
-    Test different rows result in core dump
-    """
-    logger.info("Test cache basic 4")
-    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
-
-    # This DATA_DIR only has 2 images in it
-    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
-    decode_op = c_vision.Decode()
-    ds1 = ds1.repeat(4)
-    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
-    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
-    shape = ds1.output_shapes()
-    logger.info(shape)
-    num_iter = 0
-    for _ in ds1.create_dict_iterator():
-        logger.info("get data from dataset")
-        num_iter += 1
-
-    logger.info("Number of data in ds1: {} ".format(num_iter))
-    assert num_iter == 8
-    logger.info('test_cache_basic3 Ended.\n')
+    """
+    Test different rows result in core dump
+    """
+    logger.info("Test cache basic 4")
+    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
+
+    # This DATA_DIR only has 2 images in it
+    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
+    decode_op = c_vision.Decode()
+    ds1 = ds1.repeat(4)
+    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
+    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
+    shape = ds1.output_shapes()
+    logger.info(shape)
+    num_iter = 0
+    for _ in ds1.create_dict_iterator():
+        logger.info("get data from dataset")
+        num_iter += 1
+
+    logger.info("Number of data in ds1: {} ".format(num_iter))
+    assert num_iter == 8
+    logger.info('test_cache_basic4 Ended.\n')
 
 
 def 
test_cache_map_failure1(): @@ -177,6 +178,7 @@ def test_cache_map_failure1(): assert num_iter == 0 logger.info('test_cache_failure1 Ended.\n') + if __name__ == '__main__': test_cache_map_basic1() logger.info("test_cache_map_basic1 success.")