You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

test_pad_batch.py 10 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. # Copyright 2020-2022 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import time
  16. import numpy as np
  17. import mindspore.dataset as ds
# Path to the CIFAR-10 test dataset used by the performance and config helpers below.
CIFAR10_DIR = "../data/dataset/testCifar10Data"
  19. # This UT test tests the following cases
  20. # 1. padding: input_shape=[x] output_shape=[y] where y > x
  21. # 2. padding in one dimension and truncate in the other. input_shape=[x1,x2] output_shape=[y1,y2] y1>x1 and y2<x2
  22. # 3. automatic padding for a specific column
  23. # 4. default setting for all columns
  24. # 5. test None in different places
  25. # this generator function yield two columns
  26. # col1d: [0],[1], [2], [3]
  27. # col2d: [[100],[200]], [[101],[201]], [102],[202]], [103],[203]]
  28. def gen_2cols(num):
  29. for i in range(num):
  30. yield (np.array([i]), np.array([[i + 100], [i + 200]]))
  31. # this generator function yield one column of variable shapes
  32. # col: [0], [0,1], [0,1,2], [0,1,2,3]
  33. def gen_var_col(num):
  34. for i in range(num):
  35. yield (np.array([j for j in range(i + 1)]),)
  36. # this generator function yield two columns of variable shapes
  37. # col1: [0], [0,1], [0,1,2], [0,1,2,3]
  38. # col2: [100], [100,101], [100,101,102], [100,110,102,103]
  39. def gen_var_cols(num):
  40. for i in range(num):
  41. yield (np.array([j for j in range(i + 1)]), np.array([100 + j for j in range(i + 1)]))
  42. # this generator function yield two columns of variable shapes
  43. # col1: [[0]], [[0,1]], [[0,1,2]], [[0,1,2,3]]
  44. # col2: [[100]], [[100,101]], [[100,101,102]], [[100,110,102,103]]
  45. def gen_var_cols_2d(num):
  46. for i in range(num):
  47. yield (np.array([[j for j in range(i + 1)]]), np.array([[100 + j for j in range(i + 1)]]))
  48. def test_batch_padding_01():
  49. data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
  50. data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([2, 2], -2), "col1d": ([2], -1)})
  51. data1 = data1.repeat(2)
  52. for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
  53. np.testing.assert_array_equal([[0, -1], [1, -1]], data["col1d"])
  54. np.testing.assert_array_equal([[[100, -2], [200, -2]], [[101, -2], [201, -2]]], data["col2d"])
  55. def test_batch_padding_02():
  56. data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
  57. data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([1, 2], -2)})
  58. data1 = data1.repeat(2)
  59. for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
  60. np.testing.assert_array_equal([[0], [1]], data["col1d"])
  61. np.testing.assert_array_equal([[[100, -2]], [[101, -2]]], data["col2d"])
  62. def test_batch_padding_03():
  63. data1 = ds.GeneratorDataset((lambda: gen_var_col(4)), ["col"])
  64. data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col": (None, -1)}) # pad automatically
  65. data1 = data1.repeat(2)
  66. res = dict()
  67. for ind, data in enumerate(data1.create_dict_iterator(num_epochs=1, output_numpy=True)):
  68. res[ind] = data["col"].copy()
  69. np.testing.assert_array_equal(res[0], [[0, -1], [0, 1]])
  70. np.testing.assert_array_equal(res[1], [[0, 1, 2, -1], [0, 1, 2, 3]])
  71. np.testing.assert_array_equal(res[2], [[0, -1], [0, 1]])
  72. np.testing.assert_array_equal(res[3], [[0, 1, 2, -1], [0, 1, 2, 3]])
  73. def test_batch_padding_04():
  74. data1 = ds.GeneratorDataset((lambda: gen_var_cols(2)), ["col1", "col2"])
  75. data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={}) # pad automatically
  76. data1 = data1.repeat(2)
  77. for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
  78. np.testing.assert_array_equal(data["col1"], [[0, 0], [0, 1]])
  79. np.testing.assert_array_equal(data["col2"], [[100, 0], [100, 101]])
  80. def test_batch_padding_05():
  81. data1 = ds.GeneratorDataset((lambda: gen_var_cols_2d(3)), ["col1", "col2"])
  82. data1 = data1.batch(batch_size=3, drop_remainder=False,
  83. pad_info={"col2": ([2, None], -2), "col1": (None, -1)}) # pad automatically
  84. for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
  85. np.testing.assert_array_equal(data["col1"], [[[0, -1, -1]], [[0, 1, -1]], [[0, 1, 2]]])
  86. np.testing.assert_array_equal(data["col2"], [[[100, -2, -2], [-2, -2, -2]], [[100, 101, -2], [-2, -2, -2]],
  87. [[100, 101, 102], [-2, -2, -2]]])
  88. def batch_padding_performance_3d():
  89. data1 = ds.Cifar10Dataset(CIFAR10_DIR, shuffle=False) # shape = [32,32,3]
  90. data1 = data1.repeat(24)
  91. pad_info = {"image": ([36, 36, 3], 0)}
  92. # pad_info = None
  93. data1 = data1.batch(batch_size=24, drop_remainder=True, pad_info=pad_info)
  94. start_time = time.time()
  95. num_batches = 0
  96. for _ in data1.create_dict_iterator(num_epochs=1):
  97. num_batches += 1
  98. _ = "total number of batch:" + str(num_batches) + " time elapsed:" + str(time.time() - start_time)
  99. # print(res)
  100. def batch_padding_performance_1d():
  101. data1 = ds.Cifar10Dataset(CIFAR10_DIR, shuffle=False) # shape = [32,32,3]
  102. data1 = data1.repeat(24)
  103. data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")
  104. pad_info = {"image": ([3888], 0)} # 3888 =36*36*3
  105. # pad_info = None
  106. data1 = data1.batch(batch_size=24, drop_remainder=True, pad_info=pad_info)
  107. start_time = time.time()
  108. num_batches = 0
  109. for _ in data1.create_dict_iterator(num_epochs=1):
  110. num_batches += 1
  111. _ = "total number of batch:" + str(num_batches) + " time elapsed:" + str(time.time() - start_time)
  112. # print(res)
  113. def batch_pyfunc_padding_3d():
  114. data1 = ds.Cifar10Dataset(CIFAR10_DIR, shuffle=False) # shape = [32,32,3]
  115. data1 = data1.repeat(24)
  116. # pad_info = {"image": ([36, 36, 3], 0)}
  117. data1 = data1.map(operations=(lambda x: np.pad(x, ((0, 4), (0, 4), (0, 0)))), input_columns="image",
  118. python_multiprocessing=False)
  119. data1 = data1.batch(batch_size=24, drop_remainder=True)
  120. start_time = time.time()
  121. num_batches = 0
  122. for _ in data1.create_dict_iterator(num_epochs=1):
  123. num_batches += 1
  124. _ = "total number of batch:" + str(num_batches) + " time elapsed:" + str(time.time() - start_time)
  125. # print(res)
  126. def batch_pyfunc_padding_1d():
  127. data1 = ds.Cifar10Dataset(CIFAR10_DIR, shuffle=False) # shape = [32,32,3]
  128. data1 = data1.repeat(24)
  129. data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")
  130. data1 = data1.map(operations=(lambda x: np.pad(x, (0, 816))), input_columns="image", python_multiprocessing=False)
  131. data1 = data1.batch(batch_size=24, drop_remainder=True)
  132. start_time = time.time()
  133. num_batches = 0
  134. for _ in data1.create_dict_iterator(num_epochs=1):
  135. num_batches += 1
  136. _ = "total number of batch:" + str(num_batches) + " time elapsed:" + str(time.time() - start_time)
  137. # print(res)
  138. def pad_map_config(my_num_workers=None, py_multiproc=False, my_max_rowsize=16):
  139. data1 = ds.Cifar10Dataset(CIFAR10_DIR, shuffle=False, num_samples=1000) # shape = [32,32,3]
  140. data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image",
  141. num_parallel_workers=my_num_workers, python_multiprocessing=py_multiproc,
  142. max_rowsize=my_max_rowsize) # reshape to 1d
  143. data1 = data1.map(operations=(lambda x: np.pad(x, (0, 816))), input_columns="image",
  144. num_parallel_workers=my_num_workers, python_multiprocessing=py_multiproc,
  145. max_rowsize=my_max_rowsize)
  146. data1 = data1.batch(batch_size=25, drop_remainder=True)
  147. res = []
  148. for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
  149. res.append(data["image"])
  150. return res
  151. def pad_batch_config():
  152. data2 = ds.Cifar10Dataset(CIFAR10_DIR, shuffle=False, num_samples=1000) # shape = [32,32,3]
  153. data2 = data2.map(operations=(lambda x: x.reshape(-1)), input_columns="image") # reshape to 1d
  154. data2 = data2.batch(batch_size=25, drop_remainder=True, pad_info={"image": ([3888], 0)})
  155. res = []
  156. for data in data2.create_dict_iterator(num_epochs=1, output_numpy=True):
  157. res.append(data["image"])
  158. return res
  159. def test_pad_via_map():
  160. """
  161. Feature: Batch Padding
  162. Description: Compare results for pad_batch versus numpy.pad
  163. Expectation: pad_batch and numpy.pad results are the same
  164. """
  165. res_from_map = pad_map_config()
  166. res_from_batch = pad_batch_config()
  167. assert len(res_from_batch) == len(res_from_batch)
  168. for i, _ in enumerate(res_from_map):
  169. np.testing.assert_array_equal(res_from_map[i], res_from_batch[i])
  170. def test_pad_via_map_multiproc():
  171. """
  172. Feature: Batch Padding
  173. Description: Compare results for pad_batch versus numpy.pad, with multiprocessing for map
  174. Expectation: pad_batch and numpy.pad results are the same
  175. """
  176. # Note: Reduce shared memory needed (for CI) by using small num_parallel_workers and max_rowsize values
  177. # and disabling the shared memory optimization
  178. mem_original = ds.config.get_enable_shared_mem()
  179. ds.config.set_enable_shared_mem(False)
  180. res_from_map = pad_map_config(2, True, 1)
  181. res_from_batch = pad_batch_config()
  182. assert len(res_from_batch) == len(res_from_batch)
  183. for i, _ in enumerate(res_from_map):
  184. np.testing.assert_array_equal(res_from_map[i], res_from_batch[i])
  185. ds.config.set_enable_shared_mem(mem_original)
if __name__ == '__main__':
    # Run the correctness tests directly; the performance helpers below are
    # kept for manual benchmarking only and stay commented out by default.
    test_batch_padding_01()
    test_batch_padding_02()
    test_batch_padding_03()
    test_batch_padding_04()
    test_batch_padding_05()
    # batch_padding_performance_3d()
    # batch_padding_performance_1d()
    # batch_pyfunc_padding_3d()
    # batch_pyfunc_padding_1d()
    test_pad_via_map()
    test_pad_via_map_multiproc()