|
|
|
@@ -108,6 +108,10 @@ class SummaryCollector(Callback): |
|
|
|
custom_lineage_data (Union[dict, None]): Allows you to customize the data and present it on the MindInsight |
|
|
|
lineage page. In the custom data, the key type supports str, and the value type supports str, int and float. |
|
|
|
Default: None, it means there is no custom data. |
|
|
|
collect_tensor_freq (Optional[int]): Same as the `collect_freq`, but controls TensorSummary specifically. |
|
|
|
Default: None, which means the frequency is auto-calculated so that TensorSummary is collected for at most 50 steps. |
|
|
|
max_file_size (Optional[int]): The maximum size in bytes of each file that can be written to the disk. |
|
|
|
Default: None, which means no limit. |
|
|
|
|
|
|
|
Raises: |
|
|
|
ValueError: If the parameter value is not expected. |
|
|
|
@@ -145,16 +149,28 @@ class SummaryCollector(Callback): |
|
|
|
'histogram_regular': None |
|
|
|
} |
|
|
|
|
|
|
|
def __init__(self, summary_dir, collect_freq=10, collect_specified_data=None, |
|
|
|
keep_default_action=True, custom_lineage_data=None): |
|
|
|
def __init__(self, |
|
|
|
summary_dir, |
|
|
|
collect_freq=10, |
|
|
|
collect_specified_data=None, |
|
|
|
keep_default_action=True, |
|
|
|
custom_lineage_data=None, |
|
|
|
collect_tensor_freq=None, |
|
|
|
max_file_size=None): |
|
|
|
super(SummaryCollector, self).__init__() |
|
|
|
|
|
|
|
self._summary_dir = self._process_summary_dir(summary_dir) |
|
|
|
self._record = None |
|
|
|
|
|
|
|
self._check_collect_freq(collect_freq) |
|
|
|
self._check_positive('collect_freq', collect_freq) |
|
|
|
self._collect_freq = collect_freq |
|
|
|
|
|
|
|
self._check_positive('collect_tensor_freq', collect_tensor_freq, allow_none=True) |
|
|
|
self._collect_tensor_freq = collect_tensor_freq |
|
|
|
|
|
|
|
self._check_positive('max_file_size', max_file_size, allow_none=True) |
|
|
|
self._max_file_size = max_file_size |
|
|
|
|
|
|
|
self._check_action(keep_default_action) |
|
|
|
|
|
|
|
self._collect_specified_data = self._process_specified_data(collect_specified_data, keep_default_action) |
|
|
|
@@ -165,16 +181,14 @@ class SummaryCollector(Callback): |
|
|
|
self._custom_lineage_data = custom_lineage_data |
|
|
|
|
|
|
|
self._temp_optimizer = None |
|
|
|
self._has_saved_train_network = False |
|
|
|
self._has_saved_custom_data = False |
|
|
|
self._is_parse_loss_success = True |
|
|
|
self._first_step = True |
|
|
|
self._dataset_sink_mode = True |
|
|
|
|
|
|
|
def __enter__(self):
    """Prepare the collector for a new run and return it.

    Creates exactly one `SummaryRecord` for `self._summary_dir`, passing
    `self._max_file_size` through, and resets the first-step and
    dataset-sink-mode flags to True.

    Returns:
        SummaryCollector, this instance.
    """
    # Build the record only once: constructing a plain record first and then
    # overwriting it would leave the first instance behind without it ever
    # being closed.
    self._record = SummaryRecord(log_dir=self._summary_dir, max_file_size=self._max_file_size)
    self._first_step, self._dataset_sink_mode = True, True
    return self
|
|
|
|
|
|
|
def __exit__(self, *err): |
|
|
|
@@ -198,11 +212,13 @@ class SummaryCollector(Callback): |
|
|
|
return summary_dir |
|
|
|
|
|
|
|
@staticmethod
def _check_collect_freq(freq):
    """Check that `collect_freq` is an int greater than 0.

    Args:
        freq (int): The collection frequency to validate. The type check is
            delegated to `check_value_type`.

    Raises:
        ValueError: If `freq` is less than or equal to 0.
    """
    check_value_type('collect_freq', freq, int)
    if freq <= 0:
        raise ValueError(f'For `collect_freq` the value should be greater than 0, but got `{freq}`.')

@staticmethod
def _check_positive(name, value, allow_none=False):
    """Check that the value is an int greater than 0.

    Args:
        name (str): The parameter name used in the error message.
        value (Union[int, None]): The value to validate. The type check is
            delegated to `check_value_type`.
        allow_none (bool): Whether `None` is accepted as a valid value.
            Default: False.

    Raises:
        ValueError: If `value` is less than or equal to 0.
    """
    # Only a real None may bypass validation. Returning whenever `allow_none`
    # is set would silently accept any value (negative numbers, strings, ...)
    # for every optional parameter.
    if allow_none and value is None:
        return
    check_value_type(name, value, int)
    if value <= 0:
        raise ValueError(f'For `{name}` the value should be greater than 0, but got `{value}`.')
|
|
|
|
|
|
|
@staticmethod |
|
|
|
def _check_custom_lineage_data(custom_lineage_data): |
|
|
|
@@ -276,6 +292,9 @@ class SummaryCollector(Callback): |
|
|
|
self._collect_graphs(cb_params) |
|
|
|
|
|
|
|
self._collect_dataset_graph(cb_params) |
|
|
|
if self._collect_tensor_freq is None: |
|
|
|
total_step = cb_params.epoch_num * cb_params.batch_num |
|
|
|
self._collect_tensor_freq = max(self._collect_freq, total_step // 50) |
|
|
|
|
|
|
|
if self._custom_lineage_data and not self._has_saved_custom_data: |
|
|
|
packaged_custom_data = self._package_custom_lineage_data(self._custom_lineage_data) |
|
|
|
@@ -287,24 +306,29 @@ class SummaryCollector(Callback): |
|
|
|
|
|
|
|
def step_end(self, run_context): |
|
|
|
cb_params = run_context.original_args() |
|
|
|
if cb_params.mode != ModeEnum.TRAIN.value: |
|
|
|
return |
|
|
|
if self._first_step: |
|
|
|
# Notice: This way of determining whether dataset sink mode is True does not work in the eval scenario |
|
|
|
self._dataset_sink_mode = bool(cb_params.cur_step_num == cb_params.batch_num) |
|
|
|
|
|
|
|
if cb_params.mode == ModeEnum.TRAIN.value: |
|
|
|
|
|
|
|
if not self._is_collect_this_step(cb_params): |
|
|
|
return |
|
|
|
self._dataset_sink_mode = cb_params.cur_step_num == cb_params.batch_num |
|
|
|
self._collect_at_step_end(cb_params, plugin_filter=None) |
|
|
|
self._first_step = False |
|
|
|
else: |
|
|
|
current = cb_params.cur_epoch_num if self._dataset_sink_mode else cb_params.cur_step_num |
|
|
|
if current % self._collect_freq == 0 and current % self._collect_tensor_freq == 0: |
|
|
|
self._collect_at_step_end(cb_params, plugin_filter=None) |
|
|
|
elif current % self._collect_tensor_freq == 0: |
|
|
|
self._collect_at_step_end(cb_params, lambda plugin: plugin == PluginEnum.TENSOR.value) |
|
|
|
elif current % self._collect_freq == 0: |
|
|
|
self._collect_at_step_end(cb_params, lambda plugin: plugin != PluginEnum.TENSOR.value) |
|
|
|
|
|
|
|
if not self._has_saved_train_network: |
|
|
|
self._collect_graphs(cb_params) |
|
|
|
|
|
|
|
self._collect_input_data(cb_params) |
|
|
|
self._collect_metric(cb_params) |
|
|
|
self._collect_histogram(cb_params) |
|
|
|
def _collect_at_step_end(self, cb_params, plugin_filter):
    """Gather the step-end data and write it to the summary record.

    Runs the input-data, metric and histogram collectors in that order and
    then records the current step.

    Args:
        cb_params: The callback parameters of the current step; its
            `cur_step_num` is used as the recorded step.
        plugin_filter: Forwarded unchanged to `self._record.record` as the
            `plugin_filter` keyword argument.
    """
    collectors = (self._collect_input_data, self._collect_metric, self._collect_histogram)
    for collect in collectors:
        collect(cb_params)
    step = cb_params.cur_step_num
    self._record.record(step, plugin_filter=plugin_filter)
|
|
|
|
|
|
|
self._first_step = False |
|
|
|
self._record.record(cb_params.cur_step_num) |
|
|
|
|
|
|
|
def end(self, run_context): |
|
|
|
cb_params = run_context.original_args() |
|
|
|
@@ -331,18 +355,6 @@ class SummaryCollector(Callback): |
|
|
|
raise ValueError(f"There are more than one {self.__class__.__name__} instance in callback list," |
|
|
|
f"but expected only one {self.__class__.__name__} instance.") |
|
|
|
|
|
|
|
def _is_collect_this_step(self, cb_params):
    """Tell whether data should be collected at the current step.

    The first step is always collected. Afterwards the current epoch number
    (in dataset sink mode) or the current step number (otherwise) must be a
    multiple of `self._collect_freq`.

    Args:
        cb_params: The callback parameters; `cur_epoch_num` or
            `cur_step_num` is read depending on the sink mode.

    Returns:
        bool, True if this step should be collected.
    """
    # Guarantee that the very first step is always recorded.
    if self._first_step:
        return True

    if self._dataset_sink_mode:
        counter = cb_params.cur_epoch_num
    else:
        counter = cb_params.cur_step_num
    return not counter % self._collect_freq
|
|
|
|
|
|
|
@staticmethod |
|
|
|
def _package_custom_lineage_data(custom_lineage_data): |
|
|
|
""" |
|
|
|
@@ -411,7 +423,6 @@ class SummaryCollector(Callback): |
|
|
|
if graph_proto is None: |
|
|
|
return |
|
|
|
|
|
|
|
self._has_saved_train_network = True |
|
|
|
self._record.add_value(PluginEnum.GRAPH.value, 'train_network/auto', graph_proto) |
|
|
|
|
|
|
|
def _collect_metric(self, cb_params): |
|
|
|
|