Browse Source

cross-step bucket unify

tags/v0.2.0-alpha
wenkai 6 years ago
parent
commit
c4c74bd6b4
7 changed files with 293 additions and 5 deletions
  1. +3
    -1
      mindinsight/datavisual/data_transform/events_data.py
  2. +98
    -0
      mindinsight/datavisual/data_transform/histogram_container.py
  3. +2
    -1
      mindinsight/datavisual/data_transform/ms_data_loader.py
  4. +117
    -0
      mindinsight/datavisual/data_transform/reservoir.py
  5. +2
    -3
      mindinsight/datavisual/processors/histogram_processor.py
  6. +34
    -0
      tests/ut/datavisual/data_transform/test_histogram_container.py
  7. +37
    -0
      tests/ut/datavisual/data_transform/test_reservoir.py

+ 3
- 1
mindinsight/datavisual/data_transform/events_data.py View File

@@ -93,7 +93,9 @@ class EventsData:
with self._reservoir_mutex_lock:
if tag not in self._reservoir_by_tag:
reservoir_size = self._get_reservoir_size(tensor_event.plugin_name)
self._reservoir_by_tag[tag] = reservoir.Reservoir(reservoir_size)
self._reservoir_by_tag[tag] = reservoir.ReservoirFactory().create_reservoir(
plugin_name, reservoir_size
)

tensor = _Tensor(wall_time=tensor_event.wall_time,
step=tensor_event.step,


+ 98
- 0
mindinsight/datavisual/data_transform/histogram_container.py View File

@@ -0,0 +1,98 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Histogram data container."""
import math

from mindinsight.datavisual.proto_files.mindinsight_summary_pb2 import Summary


def _mask_invalid_number(num):
"""Mask invalid number to 0."""
if math.isnan(num) or math.isinf(num):
return type(num)(0)

return num


class HistogramContainer:
    """
    Wrapper around a histogram proto message that also carries visual settings.

    The container keeps the original buckets as immutable tuples, the masked
    max/min of the tensor, and the visual range and bin number set by the caller.

    Args:
        histogram_message (Summary.Histogram): Histogram message in summary file.
    """
    def __init__(self, histogram_message: Summary.Histogram):
        self._msg = histogram_message
        # Each bucket is stored as a (left, width, count) tuple.
        self._original_buckets = tuple(
            (item.left, item.width, item.count) for item in self._msg.buckets
        )
        # NaN/inf extrema are masked to 0.
        self._max = _mask_invalid_number(histogram_message.max)
        self._min = _mask_invalid_number(histogram_message.min)
        # Until set_visual_range() is called, the visual range equals the tensor's own range.
        self._visual_max = self._max
        self._visual_min = self._min
        # Default number of bins.
        self._visual_bins = 10
        self._count = self._msg.count
        # Tuples are immutable, so the original buckets can be shared as the initial result.
        self._re_sampled_buckets = self._original_buckets

    @property
    def max(self):
        """Max value of the tensor (after masking invalid numbers)."""
        return self._max

    @property
    def min(self):
        """Min value of the tensor (after masking invalid numbers)."""
        return self._min

    @property
    def count(self):
        """Count value taken from the histogram message."""
        return self._count

    @property
    def original_msg(self):
        """Original proto message passed to the constructor."""
        return self._msg

    def set_visual_range(self, max_val: float, min_val: float, bins: int) -> None:
        """
        Record the visual range and bin number used by later re-sampling.

        The input is stored without validation; the caller must ensure it is valid.

        Args:
            max_val (float): Max value for visual histogram.
            min_val (float): Min value for visual histogram.
            bins (int): Bins number for visual histogram.
        """
        self._visual_bins = bins
        self._visual_min = min_val
        self._visual_max = max_val

        # Drop the cached result so buckets() re-samples on the next call.
        self._re_sampled_buckets = ()

    def _re_sample_buckets(self):
        """Refresh the cached buckets; currently this just reuses the original buckets."""
        # Will call re-sample logic in later PR.
        self._re_sampled_buckets = self._original_buckets

    def buckets(self):
        """
        Get visual buckets instead of original buckets.

        Returns:
            tuple, cached buckets; they are re-sampled first if the cache is empty.
        """
        if self._re_sampled_buckets:
            return self._re_sampled_buckets

        self._re_sample_buckets()
        return self._re_sampled_buckets

+ 2
- 1
mindinsight/datavisual/data_transform/ms_data_loader.py View File

@@ -36,6 +36,7 @@ from mindinsight.datavisual.proto_files import mindinsight_summary_pb2 as summar
from mindinsight.datavisual.proto_files import mindinsight_anf_ir_pb2 as anf_ir_pb2
from mindinsight.datavisual.utils import crc32
from mindinsight.utils.exceptions import UnknownError
from mindinsight.datavisual.data_transform.histogram_container import HistogramContainer

HEADER_SIZE = 8
CRC_STR_SIZE = 4
@@ -235,7 +236,7 @@ class MSDataLoader:
self._events_data.add_tensor_event(tensor_event)

if value.HasField('histogram'):
histogram_msg = value.histogram
histogram_msg = HistogramContainer(value.histogram)
tag = '{}/{}'.format(value.tag, PluginNameEnum.HISTOGRAM.value)
tensor_event = TensorEvent(wall_time=event.wall_time,
step=event.step,


+ 117
- 0
mindinsight/datavisual/data_transform/reservoir.py View File

@@ -16,7 +16,9 @@

import random
import threading
import math

from mindinsight.datavisual.common.enums import PluginNameEnum
from mindinsight.utils.exceptions import ParamValueError


@@ -106,3 +108,118 @@ class Reservoir:
round(self._sample_counter * sample_remaining_rate))

return remove_size


class _VisualRange:
"""Simple helper class to merge visual ranges."""
def __init__(self):
self._max = 0.0
self._min = 0.0
self._updated = False

def update(self, max_val: float, min_val: float) -> None:
"""
Merge visual range with given range.

Args:
max_val (float): Max value of given range.
min_val (float): Min value of given range.

"""
if not self._updated:
self._max = max_val
self._min = min_val
self._updated = True
return

if max_val > self._max:
self._max = max_val

if min_val < self._min:
self._min = min_val

@property
def max(self):
"""Gets max value of current range."""
return self._max

@property
def min(self):
"""Gets min value of current range."""
return self._min


class HistogramReservoir(Reservoir):
    """
    Reservoir for histograms, which shares one visual range across all stored steps.

    Args:
        size (int): Container Size. If the size is 0, the container is not limited.
    """
    def __init__(self, size):
        super().__init__(size)

    def samples(self):
        """
        Return all stored samples after unifying their visual range.

        The merged max/min and the largest count are collected from every
        non-empty histogram, then the resulting range and bin number are set
        on every stored histogram (empty ones included).

        Returns:
            list, a copy of the stored samples.
        """
        with self._mutex:
            # First pass: merge ranges and find the largest count.
            merged_range = _VisualRange()
            largest_count = 0
            for item in self._samples:
                hist = item.value
                if hist.count == 0:
                    # Empty tensors do not contribute to the range.
                    continue
                largest_count = max(hist.count, largest_count)
                merged_range.update(hist.max, hist.min)

            bin_number = self._calc_bins(largest_count)

            # Second pass: apply the unified range to every histogram.
            for item in self._samples:
                item.value.set_visual_range(merged_range.max, merged_range.min, bin_number)

            return list(self._samples)

    def _calc_bins(self, count):
        """
        Calculate an experience-based bin number for the given count.

        Each bin should hold enough numbers to suppress re-sample bias, so the
        bin number depends on the count. Very small counts (1 - 10) get
        hand-picked values. Larger counts aim at roughly 9-10 numbers per
        bucket on average. Too many bins distract users, so the result is
        capped at 30.

        Args:
            count (int): Number count used to choose the bin number.

        Returns:
            int, number of bins.
        """
        number_per_bucket = 10
        max_bins = 30

        if not count:
            return 1
        if count <= 10:
            return 2 if count <= 5 else 3
        if count <= 280:
            # note that math.ceil(281/10) + 1 = 30
            return math.ceil(count / number_per_bucket) + 1

        return max_bins


class ReservoirFactory:
    """Factory that picks the reservoir class matching a plugin."""
    def create_reservoir(self, plugin_name: str, size: int) -> Reservoir:
        """
        Build a reservoir suited to the given plugin.

        Args:
            plugin_name (str): Plugin name
            size (int): Container Size. If the size is 0, the container is not limited.

        Returns:
            Reservoir, a HistogramReservoir for the histogram plugin, otherwise a plain Reservoir.
        """
        is_histogram = plugin_name == PluginNameEnum.HISTOGRAM.value
        reservoir_cls = HistogramReservoir if is_histogram else Reservoir
        return reservoir_cls(size)

+ 2
- 3
mindinsight/datavisual/processors/histogram_processor.py View File

@@ -53,9 +53,8 @@ class HistogramProcessor(BaseProcessor):

histograms = []
for tensor in tensors:
buckets = []
for bucket in tensor.value.buckets:
buckets.append([bucket.left, bucket.width, bucket.count])
histogram = tensor.value
buckets = histogram.buckets()
histograms.append({
"wall_time": tensor.wall_time,
"step": tensor.step,


+ 34
- 0
tests/ut/datavisual/data_transform/test_histogram_container.py View File

@@ -0,0 +1,34 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test histogram."""
import unittest.mock as mock

from mindinsight.datavisual.data_transform import histogram_container as hist


class TestHistogram:
    """Unit tests for the histogram container."""
    def test_get_buckets(self):
        """Buckets are still returned after a visual range has been set."""
        single_bucket = mock.MagicMock(left=0, width=1, count=1)
        message = mock.MagicMock(buckets=[single_bucket])

        container = hist.HistogramContainer(message)
        container.set_visual_range(max_val=1, min_val=0, bins=1)

        result = container.buckets()
        assert len(result) == 1

+ 37
- 0
tests/ut/datavisual/data_transform/test_reservoir.py View File

@@ -0,0 +1,37 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test reservoir."""
import unittest.mock as mock

import mindinsight.datavisual.data_transform.reservoir as reservoir


class TestHistogramReservoir:
    """Unit tests for the histogram reservoir."""
    def test_samples(self):
        """Every added sample is returned by samples()."""
        factory = reservoir.ReservoirFactory()
        my_reservoir = factory.create_reservoir(reservoir.PluginNameEnum.HISTOGRAM.value, size=10)

        # Two samples sharing the same range but with different counts.
        for count in (1, 2):
            sample = mock.MagicMock()
            sample.value.count = count
            sample.value.max = 102
            sample.value.min = 101
            my_reservoir.add_sample(sample)

        stored = my_reservoir.samples()
        assert len(stored) == 2

Loading…
Cancel
Save