From bf87b1d1fb4ee0eda62a0f3b65ee934239397402 Mon Sep 17 00:00:00 2001 From: qianlong Date: Sat, 25 Jul 2020 18:05:53 +0800 Subject: [PATCH 1/3] fix cache op core dump --- .../dataset/engine/datasetops/dataset_op.cc | 3 ++ tests/ut/python/dataset/test_cache_map.py | 39 ++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc index dd53e0527d..0e0db663d0 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc @@ -392,6 +392,9 @@ uint32_t DatasetOp::GenerateCRC(const std::shared_ptr &op) { ss_str = std::regex_replace(ss_str, std::regex("Num workers.*\n"), ""); ss_str = std::regex_replace(ss_str, std::regex("\\[workers.*\\]"), ""); + // Filter out Number of rows when generating the check sum + ss_str = std::regex_replace(ss_str, std::regex("Number of rows.*\n"), ""); + // Filter out the Operator control flags field when generating the check sum ss_str = std::regex_replace(ss_str, std::regex("Operator control flags.*\n"), ""); diff --git a/tests/ut/python/dataset/test_cache_map.py b/tests/ut/python/dataset/test_cache_map.py index 5de969db6d..72f33be576 100644 --- a/tests/ut/python/dataset/test_cache_map.py +++ b/tests/ut/python/dataset/test_cache_map.py @@ -104,11 +104,11 @@ def test_cache_map_basic3(): decode_op = c_vision.Decode() ds1 = ds1.repeat(4) ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache) - print("ds1.dataset_size is ", ds1.get_dataset_size()) + logger.info("ds1.dataset_size is ", ds1.get_dataset_size()) num_iter = 0 for _ in ds1.create_dict_iterator(): - print("get data from dataset") + logger.info("get data from dataset") num_iter += 1 logger.info("Number of data in ds1: {} ".format(num_iter)) @@ -116,6 +116,31 @@ def test_cache_map_basic3(): logger.info('test_cache_basic3 
Ended.\n')
 
 
+def test_cache_map_basic4():
+    """
+    Test different rows result in core dump
+    """
+    logger.info("Test cache basic 4")
+    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
+
+    # This DATA_DIR only has 2 images in it
+    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
+    decode_op = c_vision.Decode()
+    ds1 = ds1.repeat(4)
+    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
+    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
+    shape = ds1.output_shapes()
+
+    num_iter = 0
+    for _ in ds1.create_dict_iterator():
+        logger.info("get data from dataset")
+        num_iter += 1
+
+    logger.info("Number of data in ds1: {} ".format(num_iter))
+    assert num_iter == 8
+    logger.info('test_cache_basic3 Ended.\n')
+
+
 def test_cache_map_failure1():
     """
     Test nested cache (failure)
@@ -154,10 +179,12 @@
 
 if __name__ == '__main__':
     test_cache_map_basic1()
-    print("test_cache_map_basic1 success.")
+    logger.info("test_cache_map_basic1 success.")
     test_cache_map_basic2()
-    print("test_cache_map_basic2 success.")
+    logger.info("test_cache_map_basic2 success.")
     test_cache_map_basic3()
-    print("test_cache_map_basic3 success.")
+    logger.info("test_cache_map_basic3 success.")
+    test_cache_map_basic4()
+    logger.info("test_cache_map_basic4 success.")
     test_cache_map_failure1()
-    print("test_cache_map_failure1 success.")
+    logger.info("test_cache_map_failure1 success.")

From b6529b909664e8f4d0570e6c6c3c893977179711 Mon Sep 17 00:00:00 2001
From: guansongsong
Date: Tue, 28 Jul 2020 13:03:06 +0800
Subject: [PATCH 2/3] fix cache description

---
 mindspore/dataset/engine/datasets.py      | 16 +++++++++++-----
 tests/ut/python/dataset/test_cache_map.py |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index 3319c7d53f..f2eab10d66 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ 
-435,7 +435,8 @@ class Dataset: parallel (default=None, the value from the config will be used). python_multiprocessing (bool, optional): Parallelize python operations with multiple worker process. This option could be beneficial if the python operation is computational heavy (default=False). - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. Returns: MapDataset, dataset after mapping operation. @@ -1951,7 +1952,9 @@ class MapDataset(DatasetOp): in parallel (default=None). python_multiprocessing (bool, optional): Parallelize python operations with multiple worker process. This option could be beneficial if the python operation is computational heavy (default=False). - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. + Raises: ValueError: If len(input_columns) != len(output_columns) and columns_order is not specified. @@ -2552,7 +2555,8 @@ class ImageFolderDatasetV2(MappableDataset): into (default=None). shard_id (int, optional): The shard ID within num_shards (default=None). This argument should be specified only when num_shards is also specified. - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. Raises: RuntimeError: If sampler and shuffle are specified at the same time. @@ -3348,7 +3352,8 @@ class TFRecordDataset(SourceDataset): argument should be specified only when num_shards is also specified. 
shard_equal_rows (bool): Get equal rows for all shards(default=False). If shard_equal_rows is false, number of rows of each shard may be not equal. - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. Examples: >>> import mindspore.dataset as ds >>> import mindspore.common.dtype as mstype @@ -3919,7 +3924,8 @@ class RandomDataset(SourceDataset): num_samples (int): number of samples to draw from the total. (default=None, which means all rows) num_parallel_workers (int, optional): number of workers to read the data (default=None, number set in the config). - cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used) + cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used). + The cache feature is under development and is not recommended. shuffle (bool, optional): Whether or not to perform shuffle on the dataset (default=None, expected order behavior shown in the table). 
num_shards (int, optional): Number of shards that the dataset should be divided diff --git a/tests/ut/python/dataset/test_cache_map.py b/tests/ut/python/dataset/test_cache_map.py index 72f33be576..4f8af60fce 100644 --- a/tests/ut/python/dataset/test_cache_map.py +++ b/tests/ut/python/dataset/test_cache_map.py @@ -130,7 +130,7 @@ def test_cache_map_basic4(): ds1 = ds1.map(input_columns=["image"], operations=decode_op) logger.info("ds1.dataset_size is ", ds1.get_dataset_size()) shape = ds1.output_shapes() - + logger.info(shape) num_iter = 0 for _ in ds1.create_dict_iterator(): logger.info("get data from dataset") From 1906ed1be8d9189cdc33678c2a6eb2419ead600b Mon Sep 17 00:00:00 2001 From: guansongsong Date: Tue, 28 Jul 2020 13:26:29 +0800 Subject: [PATCH 3/3] fix cache description again --- mindspore/dataset/engine/datasets.py | 3 ++ tests/ut/python/dataset/test_cache_map.py | 46 ++++++++++++----------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index f2eab10d66..93c5f7743c 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -2144,6 +2144,7 @@ class RepeatDataset(DatasetOp): """ return self.count + class SkipDataset(DatasetOp): """ The result of applying Skip operator to the input Dataset. @@ -2409,6 +2410,7 @@ class TransferDataset(DatasetOp): def stop_send(self): self.iterator.depipeline.StopSend() + class RangeDataset(MappableDataset): """ A source dataset that reads and parses datasets stored on disk in a range. @@ -5319,6 +5321,7 @@ class BuildVocabDataset(DatasetOp): return new_op + class BuildSentencePieceVocabDataset(DatasetOp): """ Build a SentencePieceVocab from a dataset. 
diff --git a/tests/ut/python/dataset/test_cache_map.py b/tests/ut/python/dataset/test_cache_map.py
index 4f8af60fce..154a4208a0 100644
--- a/tests/ut/python/dataset/test_cache_map.py
+++ b/tests/ut/python/dataset/test_cache_map.py
@@ -24,6 +24,7 @@
 DATA_DIR = "../data/dataset/testImageNetData/train/"
 
 GENERATE_GOLDEN = False
 
+
 def test_cache_map_basic1():
     """
     Test mappable leaf with cache op right over the leaf
@@ -117,28 +118,28 @@ def test_cache_map_basic4():
-    """
-    Test different rows result in core dump
-    """
-    logger.info("Test cache basic 4")
-    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
-
-    # This DATA_DIR only has 2 images in it
-    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
-    decode_op = c_vision.Decode()
-    ds1 = ds1.repeat(4)
-    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
-    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
-    shape = ds1.output_shapes()
-    logger.info(shape)
-    num_iter = 0
-    for _ in ds1.create_dict_iterator():
-        logger.info("get data from dataset")
-        num_iter += 1
-
-    logger.info("Number of data in ds1: {} ".format(num_iter))
-    assert num_iter == 8
-    logger.info('test_cache_basic3 Ended.\n')
+    """
+    Test different rows result in core dump
+    """
+    logger.info("Test cache basic 4")
+    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
+
+    # This DATA_DIR only has 2 images in it
+    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
+    decode_op = c_vision.Decode()
+    ds1 = ds1.repeat(4)
+    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
+    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
+    shape = ds1.output_shapes()
+    logger.info(shape)
+    num_iter = 0
+    for _ in ds1.create_dict_iterator():
+        logger.info("get data from dataset")
+        num_iter += 1
+
+    logger.info("Number of data in ds1: {} ".format(num_iter))
+    assert num_iter == 8
+    logger.info('test_cache_basic4 Ended.\n')
 
 
 def 
test_cache_map_failure1(): @@ -177,6 +178,7 @@ def test_cache_map_failure1(): assert num_iter == 0 logger.info('test_cache_failure1 Ended.\n') + if __name__ == '__main__': test_cache_map_basic1() logger.info("test_cache_map_basic1 success.")