From de43c11e2e510ef67da43c9dcec5ed0de23e818c Mon Sep 17 00:00:00 2001
From: Li Hongzhang
Date: Thu, 6 Aug 2020 10:30:43 +0800
Subject: [PATCH] fix several issues

- handle summary collection across multiple train calls
- fix how many tensor steps are collected in dataset sink mode
- change the log level for get_learning_rate
- update the size calculation used with `max_file_size`
- fix how `collect_tensor_freq` counts steps
---
 .../train/callback/_summary_collector.py   | 37 +++++++++++++------
 mindspore/train/summary/_summary_writer.py |  8 ++--
 mindspore/train/summary/summary_record.py  |  1 +
 3 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/mindspore/train/callback/_summary_collector.py b/mindspore/train/callback/_summary_collector.py
index 6ac0883268..04eee2dc6c 100644
--- a/mindspore/train/callback/_summary_collector.py
+++ b/mindspore/train/callback/_summary_collector.py
@@ -111,10 +111,10 @@ class SummaryCollector(Callback):
             Default: None, it means there is no custom data.
         collect_tensor_freq (Optional[int]): Same semantics as `collect_freq`, but controls TensorSummary only.
             Because TensorSummary data is much larger than other summary data, this parameter is used to reduce
-            its collection. By default, TensorSummary data will be collected at most 21 steps, but not more than how
+            its collection. By default, TensorSummary data will be collected at most 20 steps, but not more than how
             many steps other summary data will be collected.
             Default: None, which means to follow the behavior as described above. For example, given `collect_freq=10`,
-            when the total steps is 600, TensorSummary will be collected 21 steps, while other summary data 61 steps,
+            when the total steps is 600, TensorSummary will be collected 20 steps, while other summary data 61 steps,
             but when the total steps is 20, both TensorSummary and other summary data will be collected 3 steps.
             Also note that when in parallel mode, the total steps will be split evenly, which will affect how many
             steps TensorSummary will be collected.
@@ -176,6 +176,7 @@ class SummaryCollector(Callback):
         self._check_positive('collect_tensor_freq', collect_tensor_freq, allow_none=True)
         self._collect_tensor_freq = collect_tensor_freq
+        self._tensor_collect_range = None
 
         self._check_positive('max_file_size', max_file_size, allow_none=True)
         self._max_file_size = max_file_size
@@ -296,12 +297,6 @@ class SummaryCollector(Callback):
 
         self._record.set_mode(cb_params.mode)
 
-        if cb_params.mode == ModeEnum.TRAIN.value:
-            if self._collect_tensor_freq is None:
-                default_tensor_summary_limit = 20
-                total_step = cb_params.epoch_num * cb_params.batch_num
-                self._collect_tensor_freq = max(self._collect_freq, total_step // default_tensor_summary_limit)
-
     def step_end(self, run_context):
         cb_params = run_context.original_args()
         if cb_params.mode != ModeEnum.TRAIN.value:
@@ -322,17 +317,36 @@ class SummaryCollector(Callback):
         if self._first_step:
             # Notice: This way of determining whether dataset sink mode is True does not work in the eval scenario
             self._dataset_sink_mode = cb_params.cur_step_num == cb_params.batch_num
+            self._tensor_collect_range = self._get_tensor_collect_range(cb_params, self._dataset_sink_mode)
             self._collect_at_step_end(cb_params, plugin_filter=None)
             self._first_step = False
         else:
             current = cb_params.cur_epoch_num if self._dataset_sink_mode else cb_params.cur_step_num
-            if current % self._collect_freq == 0 and current % self._collect_tensor_freq == 0:
+            if current % self._collect_freq == 0 and current in self._tensor_collect_range:
                 self._collect_at_step_end(cb_params, plugin_filter=None)
-            elif current % self._collect_tensor_freq == 0:
+            elif current in self._tensor_collect_range:
                 self._collect_at_step_end(cb_params, lambda plugin: plugin == PluginEnum.TENSOR.value)
             elif current % self._collect_freq == 0:
                 self._collect_at_step_end(cb_params, lambda plugin: plugin != PluginEnum.TENSOR.value)
 
+    def _get_tensor_collect_range(self, cb_params, dataset_sink_mode):
+        """Get the range of steps at which TensorSummary data will be collected."""
+        total_step = cb_params.epoch_num
+        if not dataset_sink_mode:
+            total_step *= cb_params.batch_num
+        if self._collect_tensor_freq is not None:
+            # Use `total_step + 1` so that `total_step`, the last value of `cb_params.cur_step_num`, is included.
+            return range(0, total_step + 1, self._collect_tensor_freq)
+        summary_to_collect = len(range(0, total_step + 1, self._collect_freq))
+        default_tensor_summary_limit = 20
+        if summary_to_collect > default_tensor_summary_limit:
+            tensor_freq = total_step // (default_tensor_summary_limit - 1)
+            if tensor_freq > 1:
+                return range(0, total_step + 1, tensor_freq)[:default_tensor_summary_limit]
+            # `cb_params.cur_step_num` counts from 1, so the leading 0 never matches a step; take one extra entry.
+            return range(0, total_step + 1)[:default_tensor_summary_limit + 1]
+        return range(0, total_step + 1, self._collect_freq)
+
     def _collect_at_step_end(self, cb_params, plugin_filter):
         self._collect_input_data(cb_params)
         self._collect_metric(cb_params)
@@ -577,7 +591,8 @@ class SummaryCollector(Callback):
         """
         learning_rate = optimizer.learning_rate
         if not isinstance(learning_rate, Parameter):
-            logger.info("The learning rate detected in the optimizer is not a Parameter type, so it is not recorded.")
+            logger.warning("The learning rate detected in the optimizer "
+                           "is not a Parameter type, so it is not recorded.")
             return None
         return learning_rate.data
 
diff --git a/mindspore/train/summary/_summary_writer.py b/mindspore/train/summary/_summary_writer.py
index 2c288e16c6..a6874ce713 100644
--- a/mindspore/train/summary/_summary_writer.py
+++ b/mindspore/train/summary/_summary_writer.py
@@ -20,6 +20,8 @@ from shutil import disk_usage
 from ..._c_expression import EventWriter_
 from ._summary_adapter import package_init_event
 
+FREE_DISK_SPACE_TIMES = 32
+
 
 class BaseWriter:
     """BaseWriter to be subclassed."""
@@ -45,13 +47,13 @@ class BaseWriter:
 
     def write(self, plugin, data):
         """Write data to file."""
-        if self.writer and disk_usage(self._filepath).free < len(data) * 32:
-            raise RuntimeError(f"The disk space may be soon exhausted by the '{self._filepath}'.")
         # 8: data length
         # 4: crc32 of data length
         # 4: crc32 of data
         metadata_length = 8 + 4 + 4
         required_length = len(data) + metadata_length
+        if self.writer and disk_usage(self._filepath).free < required_length * FREE_DISK_SPACE_TIMES:
+            raise RuntimeError(f"The disk space may soon be exhausted by the '{self._filepath}'.")
         if self._max_file_size is None:
             self.writer.Write(data)
         elif self._max_file_size >= required_length:
@@ -77,7 +79,7 @@ class SummaryWriter(BaseWriter):
 
     def init_writer(self):
         """Write some metadata etc."""
-        self.writer.Write(package_init_event().SerializeToString())
+        self.write('summary', package_init_event().SerializeToString())
 
     def write(self, plugin, data):
         """Write data to file."""
diff --git a/mindspore/train/summary/summary_record.py b/mindspore/train/summary/summary_record.py
index 18cecb2914..aa72786ba4 100644
--- a/mindspore/train/summary/summary_record.py
+++ b/mindspore/train/summary/summary_record.py
@@ -156,6 +156,7 @@ class SummaryRecord:
                                    max_file_size,
                                    summary=self.full_file_name,
                                    lineage=get_event_file_name('events', '_lineage'))
+        _get_summary_tensor_data()
        atexit.register(self.close)
 
     def __enter__(self):
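
A note on the new schedule: `_get_tensor_collect_range` computes the set of steps at
which TensorSummary data is collected up front, instead of deriving a single
`collect_tensor_freq` in `begin`. Below is a minimal standalone sketch of that logic
(the arithmetic and `default_tensor_summary_limit` mirror the patch; the function name
and the checks at the bottom are illustrative only), verifying the step counts quoted
in the updated docstring:

def tensor_collect_range(total_step, collect_freq, collect_tensor_freq=None):
    """Standalone sketch of SummaryCollector._get_tensor_collect_range."""
    if collect_tensor_freq is not None:
        # `total_step + 1` keeps the last step (cur_step_num == total_step) in range.
        return range(0, total_step + 1, collect_tensor_freq)
    summary_to_collect = len(range(0, total_step + 1, collect_freq))
    default_tensor_summary_limit = 20
    if summary_to_collect > default_tensor_summary_limit:
        tensor_freq = total_step // (default_tensor_summary_limit - 1)
        if tensor_freq > 1:
            return range(0, total_step + 1, tensor_freq)[:default_tensor_summary_limit]
        # cur_step_num counts from 1, so the leading 0 never matches; keep one extra entry.
        return range(0, total_step + 1)[:default_tensor_summary_limit + 1]
    return range(0, total_step + 1, collect_freq)

# Docstring examples with collect_freq=10. The first step is always collected
# unconditionally, so it is counted separately from the range matches.
long_run = tensor_collect_range(600, 10)
assert 1 + sum(step in long_run for step in range(2, 601)) == 20
short_run = tensor_collect_range(20, 10)
assert 1 + sum(step in short_run for step in range(2, 21)) == 3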
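
step_end then dispatches on two independent conditions: whether the current step falls
on `collect_freq` and whether it sits in the tensor range. A hedged sketch of that
decision table (the helper name is invented, and 'tensor' stands in for
PluginEnum.TENSOR.value; only the branch structure comes from the patch):

def step_end_action(current, collect_freq, tensor_range):
    """Return (collect, plugin_filter) as the reworked step_end would choose them."""
    on_freq = current % collect_freq == 0
    on_tensor = current in tensor_range
    if on_freq and on_tensor:
        return True, None                               # plugin_filter=None: everything
    if on_tensor:
        return True, lambda plugin: plugin == 'tensor'  # TensorSummary only
    if on_freq:
        return True, lambda plugin: plugin != 'tensor'  # everything except tensors
    return False, None                                  # collect nothing this step

# Step 589 is in range(0, 601, 31) but not a multiple of 10: tensors only.
collect, plugin_filter = step_end_action(589, 10, range(0, 601, 31))
assert collect and plugin_filter('tensor') and not plugin_filter('histogram')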
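
On the writer side, the free-space guard now runs after the record size is known, so it
accounts for the full framed record (payload plus a 16-byte envelope: an 8-byte length,
a CRC32 of the length, and a CRC32 of the data) rather than the raw payload alone. A
minimal sketch of the arithmetic, assuming only what the hunk shows
(FREE_DISK_SPACE_TIMES is the patch's safety factor; the helper itself is hypothetical):

from shutil import disk_usage

FREE_DISK_SPACE_TIMES = 32   # refuse to write when free space < 32x the record size
METADATA_LENGTH = 8 + 4 + 4  # 8: data length, 4: crc32 of the length, 4: crc32 of the data

def ensure_disk_space(filepath, data):
    """Raise early if free disk space cannot comfortably hold the framed record."""
    required_length = len(data) + METADATA_LENGTH
    if disk_usage(filepath).free < required_length * FREE_DISK_SPACE_TIMES:
        raise RuntimeError(f"The disk space may soon be exhausted by the '{filepath}'.")
    return required_length

Routing init_writer through write('summary', ...) gives the initialization event the
same free-space guard and the same `max_file_size` accounting as every other record.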