Browse Source

!15963 Support getting dynamic_min_max_shape of data

From: @luoyang42
Reviewed-by: 
Signed-off-by:
tags/v1.3.0
mindspore-ci-bot Gitee 4 years ago
parent
commit
2433778dac
5 changed files with 250 additions and 1 deletions
  1. +12
    -0
      mindspore/dataset/core/config.py
  2. +80
    -1
      mindspore/dataset/engine/datasets.py
  3. +3
    -0
      mindspore/train/dataset_helper.py
  4. +3
    -0
      tests/dataset_mock.py
  5. +152
    -0
      tests/ut/python/dataset/test_datasets_get_dynamic_shape.py

+ 12
- 0
mindspore/dataset/core/config.py View File

@@ -32,6 +32,18 @@ INT32_MAX = 2147483647
UINT32_MAX = 4294967295

_config = cde.GlobalContext.config_manager()
_dynamic_columns = dict()


def set_dynamic_columns(columns=None):
    """
    Store the dynamic shape setting of dataset columns in the module-level `_dynamic_columns`.

    Args:
        columns (dict): Setting keyed by column name, in the form shown by the
            example in the error message: {"data1": [16, None, 256]}.

    Raises:
        TypeError: If `columns` is not a dict (this includes the default None).
    """
    global _dynamic_columns
    if not isinstance(columns, dict):
        raise TypeError("Pass a dict to set dynamic shape, example: {\"data1\": [16, None, 256]}")
    # Keep a private copy: if the caller adds or removes entries in its own dict
    # afterwards, the stored global setting must not change behind our back.
    _dynamic_columns = dict(columns)


def get_dynamic_columns():
    """Return the dynamic column setting currently held in the module-level `_dynamic_columns`."""
    current_setting = _dynamic_columns
    return current_setting


def _init_device_info():


+ 80
- 1
mindspore/dataset/engine/datasets.py View File

@@ -60,7 +60,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \
check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
get_prefetch_size
get_prefetch_size, get_dynamic_columns
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
from ..core.validator_helpers import replace_none

@@ -211,6 +211,9 @@ class Dataset:
self._num_classes = None
self._repeat_count = None
self._class_indexing = None
self.min_shapes = None
self.max_shapes = None
self.dynamic_shapes = None
self._sync = False

def create_ir_tree(self):
@@ -1556,6 +1559,82 @@ class Dataset:
self.close_pool()
return self.dataset_size

def get_dynamic_min_max_shape(self):
    """
    Get dynamic information of source data.

    The dataset is iterated once (one epoch) to collect every shape each column takes. The result is
    stored on the instance and returned directly by later calls; `self.dataset_size` is set to the
    number of rows seen during that pass.

    Returns:
        tuple, (min_shapes, max_shapes, dynamic_shapes). Each is a list holding one shape (list of int)
        per dataset column, in the order of `self.get_col_names()`. For a dimension configured as None
        through set_dynamic_columns(), the min shape is 1, the max shape is the largest size seen and
        the dynamic shape is -1. Columns that are not configured report their single fixed shape in
        all three lists.

    Raises:
        RuntimeError: If no dynamic column is configured, a configured column is not in the dataset,
            a row does not match the configured shape, a column that is not configured has more than
            one shape, or the dataset yields no rows.
    """
    # Assume data1 shape is dynamic, data2 shape is fix
    # {"data1": [batch_size, None, feat_len], "data2": [batch_size, feat_len]}
    dynamic_columns = get_dynamic_columns()
    if not dynamic_columns:
        raise RuntimeError("dynamic_columns is not set, call set_dynamic_columns() first.")

    # NOTE(review): the stored result is returned without checking whether the dynamic column setting
    # changed since it was computed.
    if self.min_shapes is not None and self.max_shapes is not None and self.dynamic_shapes is not None:
        return self.min_shapes, self.max_shapes, self.dynamic_shapes

    logger.warning("Calculating dynamic shape of input data, this will take a few minutes...")

    # ["data1", "data2"]
    dataset_columns = self.get_col_names()
    for column in dynamic_columns:
        if column not in dataset_columns:
            raise RuntimeError("dynamic column [" + column + "] does not match any column in dataset: " +
                               str(dataset_columns))

    # Shape[1] of data1 is variable
    # {"data1": {(batch_size, 100, feat_len), (16, 200, 83)}, "data2": {(batch_size, feat_len)}}
    column_shape_set = {col: set() for col in dataset_columns}
    dataset_size_counter = 0
    for data in self.create_dict_iterator(num_epochs=1, output_numpy=True):
        dataset_size_counter += 1
        for col, value in data.items():
            actual_shape = tuple(value.shape)
            if col in dynamic_columns:
                expected_shape = dynamic_columns[col]
                # A None entry accepts any size; every other entry must equal the real size.
                matched = value.ndim == len(expected_shape) and all(
                    expected is None or expected == actual
                    for expected, actual in zip(expected_shape, actual_shape))
                if not matched:
                    # The message is only built on failure instead of once per row and column.
                    raise RuntimeError("dynamic column [" + col + "] with shape " + str(expected_shape) +
                                       " does not match dataset column [" + col + "] with shape " +
                                       str(list(actual_shape)))
            column_shape_set[col].add(actual_shape)

    # we get dataset_size after dryrun
    self.dataset_size = dataset_size_counter

    min_shapes, max_shapes, dynamic_shapes = [], [], []
    for col, shape_set in column_shape_set.items():
        if not shape_set:
            # No row was produced for this column, so there is no shape to report.
            raise RuntimeError("column [" + col + "] has no data, unable to get its shape.")

        if col not in dynamic_columns:
            if len(shape_set) > 1:
                raise RuntimeError("column [" + col + "] has dynamic shape but not set by set_dynamic_columns()" +
                                   ", shapes of [" + col + "]: " + str(list(shape_set)))
            # Also append fix shape to keep order of column shape.
            # Each result list gets its own copy so that editing one does not change the others.
            fix_shape = next(iter(shape_set))
            max_shapes.append(list(fix_shape))
            min_shapes.append(list(fix_shape))
            dynamic_shapes.append(list(fix_shape))
            continue

        if len(shape_set) == 1:
            logger.warning("column [" + col + "] has no dynamic shape but set by set_dynamic_columns()")

        # The configured setting is honoured even when only one shape was seen: a dimension the user
        # marked as None stays dynamic.
        declared_shape = dynamic_columns[col]
        # sizes_per_dim[k] holds the sizes seen for dimension k over all collected shapes.
        sizes_per_dim = list(zip(*shape_set))
        max_shapes.append([max(sizes) for sizes in sizes_per_dim])
        # Set min shape to 1 due to unknown shuffle
        min_shapes.append([1 if declared is None else min(sizes)
                           for declared, sizes in zip(declared_shape, sizes_per_dim)])
        # Set dynamic dim to -1 for ME
        dynamic_shapes.append([-1 if declared is None else declared for declared in declared_shape])

    self.min_shapes = min_shapes
    self.max_shapes = max_shapes
    self.dynamic_shapes = dynamic_shapes
    return self.min_shapes, self.max_shapes, self.dynamic_shapes

def num_classes(self):
"""
Get the number of classes in a dataset.


+ 3
- 0
mindspore/train/dataset_helper.py View File

@@ -254,6 +254,8 @@ class DatasetHelper:
def get_data_info(self):
    """Call `get_data_info` on the wrapped iterator (`self.iter`) and return its result."""
    wrapped_iter = self.iter
    return wrapped_iter.get_data_info()

def get_dynamic_min_max_shape(self):
    """Call `get_dynamic_min_max_shape` on the wrapped iterator (`self.iter`) and return its result."""
    wrapped_iter = self.iter
    return wrapped_iter.get_dynamic_min_max_shape()

class _DatasetIter:
"""Base iter for dataset helper"""
@@ -283,6 +285,7 @@ class _DatasetIter:
self.release = dataset.__transfer_dataset__.release
self.continue_send = dataset.__transfer_dataset__.continue_send
self.get_data_info = dataset.__transfer_dataset__.get_data_info
self.get_dynamic_min_max_shape = dataset.__transfer_dataset__.get_dynamic_min_max_shape
self.dataset_types, self.dataset_shapes = _get_types_and_shapes(dataset)

def __iter__(self):


+ 3
- 0
tests/dataset_mock.py View File

@@ -73,6 +73,9 @@ class MindData:
def get_data_info(self):
    """Placeholder that does nothing and returns None."""
    return None

def get_dynamic_min_max_shape(self):
    """Placeholder that does nothing and returns None."""
    return None

def __len__(self):
    """Return the value stored in `self._size`."""
    stored_size = self._size
    return stored_size



+ 152
- 0
tests/ut/python/dataset/test_datasets_get_dynamic_shape.py View File

@@ -0,0 +1,152 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest
import mindspore.dataset as ds
from mindspore import log as logger


def generator0():
    """Yield 20 rows of three arrays whose shapes grow with i, for i from 50 to 69."""
    for size in range(50, 70):
        data1 = np.ones((32, size))
        data2 = np.zeros((16, size, size, 3))
        data3 = np.ones(size)
        yield data1, data2, data3


def test_get_dynamic_min_max_shape_0():
    """Check the reported shapes when all three columns are configured with dynamic dimensions."""
    logger.info("Test get_dynamic_min_max_shape with dynamic shape columns")

    column_names = ["data1", "data2", "data3"]
    dataset = ds.GeneratorDataset(generator0, column_names)

    # every column carries at least one dimension marked as None
    dynamic_setting = {"data1": [32, None], "data2": [16, None, None, 3], "data3": [None]}
    ds.config.set_dynamic_columns(columns=dynamic_setting)

    # one pass over the data produces the three shape lists
    shape_info = dataset.get_dynamic_min_max_shape()
    min_shapes, max_shapes, dynamic_shapes = shape_info

    # dynamic dims: min is 1, max is the largest size produced by generator0, dynamic is -1
    expected_min = [[32, 1], [16, 1, 1, 3], [1]]
    expected_max = [[32, 69], [16, 69, 69, 3], [69]]
    expected_dynamic = [[32, -1], [16, -1, -1, 3], [-1]]
    np.testing.assert_array_equal(min_shapes, expected_min)
    np.testing.assert_array_equal(max_shapes, expected_max)
    np.testing.assert_array_equal(dynamic_shapes, expected_dynamic)


def generator1():
    """Yield 99 rows: a (16, i, 83) array of ones and a 0-d array holding i, for i from 1 to 99."""
    for length in range(1, 100):
        features = np.ones((16, length, 83))
        scalar = np.array(length)
        yield features, scalar


def test_get_dynamic_min_max_shape_1():
    """Check the reported shapes when one configured column is dynamic and the other is fixed."""
    logger.info("Test get_dynamic_min_max_shape with dynamic shape column and fix shape column")

    dataset = ds.GeneratorDataset(generator1, ["data1", "data2"])

    # "data2" is configured too, although its shape never changes
    dynamic_setting = {"data1": [16, None, 83], "data2": []}
    ds.config.set_dynamic_columns(columns=dynamic_setting)

    shape_info = dataset.get_dynamic_min_max_shape()
    min_shapes, max_shapes, dynamic_shapes = shape_info

    # a warning tells the user that "data2" is not dynamic; its shape is reported as is
    expected_min = [[16, 1, 83], []]
    expected_max = [[16, 99, 83], []]
    expected_dynamic = [[16, -1, 83], []]
    np.testing.assert_array_equal(min_shapes, expected_min)
    np.testing.assert_array_equal(max_shapes, expected_max)
    np.testing.assert_array_equal(dynamic_shapes, expected_dynamic)


def test_get_dynamic_min_max_shape_2():
    """Check the reported shapes when every dimension of a column is configured as dynamic."""
    logger.info("Test get_dynamic_min_max_shape with all dynamic config")

    dataset = ds.GeneratorDataset(generator1, ["data1", "data2"])

    # all three dims of "data1" are marked dynamic
    dynamic_setting = {"data1": [None, None, None]}
    ds.config.set_dynamic_columns(columns=dynamic_setting)
    shape_info = dataset.get_dynamic_min_max_shape()
    min_shapes, max_shapes, dynamic_shapes = shape_info

    # shape[0] and shape[2] of data1 never change in the data, but the user declared them dynamic,
    # so they are reported as dynamic
    expected_min = [[1, 1, 1], []]
    expected_max = [[16, 99, 83], []]
    expected_dynamic = [[-1, -1, -1], []]
    np.testing.assert_array_equal(min_shapes, expected_min)
    np.testing.assert_array_equal(max_shapes, expected_max)
    np.testing.assert_array_equal(dynamic_shapes, expected_dynamic)


def generator2():
    """Yield 20 rows: a (16, i, 83) array of ones and a (5, 5) array of ones, for i from 80 to 99."""
    for length in range(80, 100):
        variable_part = np.ones((16, length, 83))
        fixed_part = np.ones((5, 5))
        yield variable_part, fixed_part


def test_get_dynamic_min_max_shape_3():
    """Check that a fixed-shape column left out of the setting is still reported."""
    logger.info("Test get_dynamic_min_max_shape with only config dynamic column")

    dataset = ds.GeneratorDataset(generator2, ["data1", "data2"])

    # only the column with a dynamic shape is configured
    dynamic_setting = {"data1": [16, None, 83]}
    ds.config.set_dynamic_columns(columns=dynamic_setting)

    shape_info = dataset.get_dynamic_min_max_shape()
    min_shapes, max_shapes, dynamic_shapes = shape_info

    # the fixed shape of "data2" is appended to every shape list as well
    expected_min = [[16, 1, 83], [5, 5]]
    expected_max = [[16, 99, 83], [5, 5]]
    expected_dynamic = [[16, -1, 83], [5, 5]]
    np.testing.assert_array_equal(min_shapes, expected_min)
    np.testing.assert_array_equal(max_shapes, expected_max)
    np.testing.assert_array_equal(dynamic_shapes, expected_dynamic)


def test_get_dynamic_min_max_shape_4():
    """Check the errors raised for invalid dynamic column settings."""
    logger.info("Test get_dynamic_min_max_shape with unexpected column setting")

    dataset = ds.GeneratorDataset(generator1, ["data1", "data2"])

    # the setting must be a dict
    with pytest.raises(TypeError) as info:
        ds.config.set_dynamic_columns(columns=[])
    assert "Pass a dict to set dynamic shape" in str(info.value)

    # each entry: (setting handed to set_dynamic_columns, text expected in the RuntimeError)
    runtime_error_cases = [
        # dynamic column is not set
        ({}, "dynamic_columns is not set, call set_dynamic_columns() first"),
        # the column that really is dynamic is missing from the setting
        ({"data2": []}, "column [data1] has dynamic shape but not set by set_dynamic_columns()"),
        # column does not exist
        ({"data3": [16, None, 83]}, "dynamic column [data3] does not match any column in dataset"),
        # unexpected column shape: dynamic dim declared at the wrong position
        ({"data1": [16, 83, None]},
         "shape [16, 83, None] does not match dataset column [data1] with shape [16, 1, 83]"),
        # unexpected column shape: wrong number of dims
        ({"data1": [16, None]},
         "shape [16, None] does not match dataset column [data1] with shape [16, 1, 83]"),
    ]
    for setting, expected_message in runtime_error_cases:
        with pytest.raises(RuntimeError) as info:
            ds.config.set_dynamic_columns(columns=setting)
            dataset.get_dynamic_min_max_shape()
        assert expected_message in str(info.value)


if __name__ == "__main__":
    # run every test case in order when the file is executed as a script
    for run_case in (test_get_dynamic_min_max_shape_0,
                     test_get_dynamic_min_max_shape_1,
                     test_get_dynamic_min_max_shape_2,
                     test_get_dynamic_min_max_shape_3,
                     test_get_dynamic_min_max_shape_4):
        run_case()

Loading…
Cancel
Save