Merge pull request !3183 from tony_liu2/master (tags/v0.7.0-beta)
@@ -99,8 +99,13 @@ def test_invalid_mindrecord():
     num_iter = 0
     for _ in data_set.create_dict_iterator():
         num_iter += 1
-    assert num_iter == 0
-    os.remove('dummy.mindrecord')
+    try:
+        assert num_iter == 0
+    except Exception as error:
+        os.remove('dummy.mindrecord')
+        raise error
+    else:
+        os.remove('dummy.mindrecord')


 def test_minddataset_lack_db():
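Every hunk in this file makes the same change: a bare assert followed by an unconditional os.remove becomes a try/except/else, so the temporary MindRecord file is removed whether the assertion passes or fails, and a failure is still re-raised for pytest to report. A minimal, self-contained sketch of the pattern; check_and_cleanup and tmp_path are illustrative names, not from the diff:

import os

def check_and_cleanup(tmp_path, num_iter):
    try:
        assert num_iter == 0
    except Exception as error:
        os.remove(tmp_path)  # clean up the stale file before propagating the failure
        raise error
    else:
        os.remove(tmp_path)  # identical cleanup on the success path

Since both branches run the same cleanup here, a try/finally would behave the same; the except/else split only earns its keep in the two-stage hunk further down, where the success path must defer cleanup.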
@@ -113,8 +118,13 @@ def test_minddataset_lack_db():
     num_iter = 0
     for _ in data_set.create_dict_iterator():
         num_iter += 1
-    assert num_iter == 0
-    os.remove(CV_FILE_NAME)
+    try:
+        assert num_iter == 0
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)


 def test_cv_minddataset_pk_sample_error_class_column():
@@ -189,10 +199,16 @@ def test_minddataset_invalidate_num_shards():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
-    os.remove(CV_FILE_NAME)
-    os.remove("{}.db".format(CV_FILE_NAME))
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))


 def test_minddataset_invalidate_shard_id():
     create_cv_mindrecord(1)
@@ -203,9 +219,15 @@ def test_minddataset_invalidate_shard_id():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
-    os.remove(CV_FILE_NAME)
-    os.remove("{}.db".format(CV_FILE_NAME))
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))


 def test_minddataset_shard_id_bigger_than_num_shard():
@@ -217,17 +239,28 @@ def test_minddataset_shard_id_bigger_than_num_shard():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value)
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error

     with pytest.raises(Exception) as error_info:
         data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, True, 2, 5)
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value)
-    os.remove(CV_FILE_NAME)
-    os.remove("{}.db".format(CV_FILE_NAME))
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))


 def test_cv_minddataset_partition_num_samples_equals_0():
     """tutorial for cv minddataset."""
@@ -245,7 +278,26 @@ def test_cv_minddataset_partition_num_samples_equals_0():
                num_iter += 1
     with pytest.raises(Exception) as error_info:
         partitions(5)
-    assert 'num_samples should be a positive integer value, but got num_samples=0' in str(error_info.value)
-    os.remove(CV_FILE_NAME)
-    os.remove("{}.db".format(CV_FILE_NAME))
+    try:
+        assert 'num_samples should be a positive integer value, but got num_samples=0' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+
+
+if __name__ == '__main__':
+    test_cv_lack_json()
+    test_cv_lack_mindrecord()
+    test_invalid_mindrecord()
+    test_minddataset_lack_db()
+    test_cv_minddataset_pk_sample_error_class_column()
+    test_cv_minddataset_pk_sample_exclusive_shuffle()
+    test_cv_minddataset_reader_different_schema()
+    test_cv_minddataset_reader_different_page_size()
+    test_minddataset_invalidate_num_shards()
+    test_minddataset_invalidate_shard_id()
+    test_minddataset_shard_id_bigger_than_num_shard()
+    test_cv_minddataset_partition_num_samples_equals_0()
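Note that the assertions these hunks wrap run after the pytest.raises block exits, so a failed message check previously leaked the files. A minimal, self-contained illustration of that pattern; the ValueError and message are illustrative, since the real tests raise from MindDataset construction:

import pytest

def test_message_check():
    with pytest.raises(ValueError) as error_info:
        raise ValueError("Input shard_id is not within the required interval of (0 to 0).")
    # error_info is only populated once the with-block has exited
    assert 'Input shard_id is not within' in str(error_info.value)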
@@ -27,54 +27,64 @@ CV_FILE_NAME = "./complex.mindrecord"

 def test_cv_minddataset_reader_multi_image_and_ndarray_tutorial():
-    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
-    cv_schema_json = {"id": {"type": "int32"},
-                      "image_0": {"type": "bytes"},
-                      "image_2": {"type": "bytes"},
-                      "image_3": {"type": "bytes"},
-                      "image_4": {"type": "bytes"},
-                      "input_mask": {"type": "int32", "shape": [-1]},
-                      "segments": {"type": "float32", "shape": [2, 3]}}
-    writer.add_schema(cv_schema_json, "two_images_schema")
-    with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader:
-        img_data = file_reader.read()
-    ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32)
-    ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32)
-    data = []
-    for i in range(5):
-        item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data, "image_4": img_data,
-                "input_mask": ndarray_1, "segments": ndarray_2}
-        data.append(item)
-    writer.write_raw_data(data)
-    writer.commit()
-    assert os.path.exists(CV_FILE_NAME)
-    assert os.path.exists(CV_FILE_NAME + ".db")
-
-    # tutorial for minderdataset.
-    columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"]
-    num_readers = 1
-    data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers)
-    assert data_set.get_dataset_size() == 5
-    num_iter = 0
-    for item in data_set.create_dict_iterator():
-        assert len(item) == 7
-        logger.info("item: {}".format(item))
-        assert item["image_0"].dtype == np.uint8
-        assert (item["image_0"] == item["image_2"]).all()
-        assert (item["image_3"] == item["image_4"]).all()
-        assert (item["image_0"] == item["image_4"]).all()
-        assert item["image_2"].dtype == np.uint8
-        assert item["image_3"].dtype == np.uint8
-        assert item["image_4"].dtype == np.uint8
-        assert item["id"].dtype == np.int32
-        assert item["input_mask"].shape == (5,)
-        assert item["input_mask"].dtype == np.int32
-        assert item["segments"].shape == (2, 3)
-        assert item["segments"].dtype == np.float32
-        num_iter += 1
-    assert num_iter == 5
-
-    if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
-        os.remove(CV_FILE_NAME + ".db")
-    if os.path.exists("{}".format(CV_FILE_NAME)):
-        os.remove(CV_FILE_NAME)
+    try:
+        writer = FileWriter(CV_FILE_NAME, FILES_NUM)
+        cv_schema_json = {"id": {"type": "int32"},
+                          "image_0": {"type": "bytes"},
+                          "image_2": {"type": "bytes"},
+                          "image_3": {"type": "bytes"},
+                          "image_4": {"type": "bytes"},
+                          "input_mask": {"type": "int32", "shape": [-1]},
+                          "segments": {"type": "float32", "shape": [2, 3]}}
+        writer.add_schema(cv_schema_json, "two_images_schema")
+        with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader:
+            img_data = file_reader.read()
+        ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32)
+        ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32)
+        data = []
+        for i in range(5):
+            item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data, "image_4": img_data,
+                    "input_mask": ndarray_1, "segments": ndarray_2}
+            data.append(item)
+        writer.write_raw_data(data)
+        writer.commit()
+        assert os.path.exists(CV_FILE_NAME)
+        assert os.path.exists(CV_FILE_NAME + ".db")
+
+        # tutorial for minderdataset.
+        columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"]
+        num_readers = 1
+        data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers)
+        assert data_set.get_dataset_size() == 5
+        num_iter = 0
+        for item in data_set.create_dict_iterator():
+            assert len(item) == 7
+            logger.info("item: {}".format(item))
+            assert item["image_0"].dtype == np.uint8
+            assert (item["image_0"] == item["image_2"]).all()
+            assert (item["image_3"] == item["image_4"]).all()
+            assert (item["image_0"] == item["image_4"]).all()
+            assert item["image_2"].dtype == np.uint8
+            assert item["image_3"].dtype == np.uint8
+            assert item["image_4"].dtype == np.uint8
+            assert item["id"].dtype == np.int32
+            assert item["input_mask"].shape == (5,)
+            assert item["input_mask"].dtype == np.int32
+            assert item["segments"].shape == (2, 3)
+            assert item["segments"].dtype == np.float32
+            num_iter += 1
+        assert num_iter == 5
+    except Exception as error:
+        if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
+            os.remove(CV_FILE_NAME + ".db")
+        if os.path.exists("{}".format(CV_FILE_NAME)):
+            os.remove(CV_FILE_NAME)
+        raise error
+    else:
+        if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
+            os.remove(CV_FILE_NAME + ".db")
+        if os.path.exists("{}".format(CV_FILE_NAME)):
+            os.remove(CV_FILE_NAME)


 if __name__ == '__main__':
     test_cv_minddataset_reader_multi_image_and_ndarray_tutorial()
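Unlike the first file, this test wraps file creation itself inside the try, so the cleanup branches cannot assume the .mindrecord and .db files were ever written; each removal is therefore guarded with os.path.exists. A sketch of that guard as a hypothetical helper (remove_if_exists is not a name from the diff):

import os

def remove_if_exists(path):
    # os.remove raises FileNotFoundError for a missing file, so probe first
    if os.path.exists(path):
        os.remove(path)

contextlib.suppress(FileNotFoundError) around os.remove would be an equivalent idiom.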
@@ -44,24 +44,31 @@ def add_and_remove_cv_file():
     """add/remove cv file"""
     paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0'))
              for x in range(FILES_NUM)]
-    for x in paths:
-        os.remove("{}".format(x)) if os.path.exists("{}".format(x)) else None
-        os.remove("{}.db".format(x)) if os.path.exists(
-            "{}.db".format(x)) else None
-    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
-    data = get_data(CV_DIR_NAME)
-    cv_schema_json = {"id": {"type": "int32"},
-                      "file_name": {"type": "string"},
-                      "label": {"type": "int32"},
-                      "data": {"type": "bytes"}}
-    writer.add_schema(cv_schema_json, "img_schema")
-    writer.add_index(["file_name", "label"])
-    writer.write_raw_data(data)
-    writer.commit()
-    yield "yield_cv_data"
-    for x in paths:
-        os.remove("{}".format(x))
-        os.remove("{}.db".format(x))
+    try:
+        for x in paths:
+            os.remove("{}".format(x)) if os.path.exists("{}".format(x)) else None
+            os.remove("{}.db".format(x)) if os.path.exists(
+                "{}.db".format(x)) else None
+        writer = FileWriter(CV_FILE_NAME, FILES_NUM)
+        data = get_data(CV_DIR_NAME)
+        cv_schema_json = {"id": {"type": "int32"},
+                          "file_name": {"type": "string"},
+                          "label": {"type": "int32"},
+                          "data": {"type": "bytes"}}
+        writer.add_schema(cv_schema_json, "img_schema")
+        writer.add_index(["file_name", "label"])
+        writer.write_raw_data(data)
+        writer.commit()
+        yield "yield_cv_data"
+    except Exception as error:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))
+        raise error
+    else:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))


 @pytest.fixture
@@ -69,32 +76,39 @@ def add_and_remove_nlp_file():
     """add/remove nlp file"""
     paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
              for x in range(FILES_NUM)]
-    for x in paths:
-        if os.path.exists("{}".format(x)):
-            os.remove("{}".format(x))
-        if os.path.exists("{}.db".format(x)):
-            os.remove("{}.db".format(x))
-    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
-    data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)]
-    nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"},
-                       "rating": {"type": "float32"},
-                       "input_ids": {"type": "int64",
-                                     "shape": [-1]},
-                       "input_mask": {"type": "int64",
-                                      "shape": [1, -1]},
-                       "segment_ids": {"type": "int64",
-                                       "shape": [2, -1]}
-                       }
-    writer.set_header_size(1 << 14)
-    writer.set_page_size(1 << 15)
-    writer.add_schema(nlp_schema_json, "nlp_schema")
-    writer.add_index(["id", "rating"])
-    writer.write_raw_data(data)
-    writer.commit()
-    yield "yield_nlp_data"
-    for x in paths:
-        os.remove("{}".format(x))
-        os.remove("{}.db".format(x))
+    try:
+        for x in paths:
+            if os.path.exists("{}".format(x)):
+                os.remove("{}".format(x))
+            if os.path.exists("{}.db".format(x)):
+                os.remove("{}.db".format(x))
+        writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
+        data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)]
+        nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"},
+                           "rating": {"type": "float32"},
+                           "input_ids": {"type": "int64",
+                                         "shape": [-1]},
+                           "input_mask": {"type": "int64",
+                                          "shape": [1, -1]},
+                           "segment_ids": {"type": "int64",
+                                           "shape": [2, -1]}
+                           }
+        writer.set_header_size(1 << 14)
+        writer.set_page_size(1 << 15)
+        writer.add_schema(nlp_schema_json, "nlp_schema")
+        writer.add_index(["id", "rating"])
+        writer.write_raw_data(data)
+        writer.commit()
+        yield "yield_nlp_data"
+    except Exception as error:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))
+        raise error
+    else:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))
 def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file):
     """tutorial for cv minderdataset."""
@@ -119,7 +133,7 @@ def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file):
                                               encoding='utf8')
             assert item['label'] == padded_sample['label']
             assert (item['data'] == np.array(list(padded_sample['data']))).all()
-            num_iter += 1
+        num_iter += 1
     assert num_padded_iter == 5
     assert num_iter == 15
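The one-level dedent here is a logic fix, not formatting: num_iter must count every row the iterator yields, while num_padded_iter counts only the padded ones; nested under the if, the old line could never reach 15. Schematically (rows and padded_ids are illustrative stand-ins for the dict iterator and the padded-sample check):

rows = range(15)               # stands in for the dataset iterator
padded_ids = {0, 3, 6, 9, 12}  # stands in for the padded-sample test
num_iter = 0
num_padded_iter = 0
for r in rows:
    if r in padded_ids:
        num_padded_iter += 1   # only padded rows counted here
    num_iter += 1              # dedented: counts all rows
assert num_padded_iter == 5
assert num_iter == 15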
@@ -636,3 +650,17 @@ def inputs(vectors, maxlen=50):
     mask = [1] * length + [0] * (maxlen - length)
     segment = [0] * maxlen
     return input_, mask, segment
+
+
+if __name__ == '__main__':
+    test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples_multi_epoch(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples_no_dividsible(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples_dataset_size_no_divisible(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples_no_equal_column_list(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples_no_column_list(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples_no_num_padded(add_and_remove_cv_file)
+    test_cv_minddataset_partition_padded_samples_no_padded_samples(add_and_remove_cv_file)
+    test_nlp_minddataset_reader_basic_padded_samples(add_and_remove_nlp_file)
+    test_nlp_minddataset_reader_basic_padded_samples_multi_epoch(add_and_remove_nlp_file)
+    test_nlp_minddataset_reader_basic_padded_samples_check_whole_reshuffle_result_per_epoch(add_and_remove_nlp_file)
@@ -34,26 +34,32 @@ def add_and_remove_cv_file():
     """add/remove cv file"""
     paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0'))
              for x in range(FILES_NUM)]
-    for x in paths:
-        if os.path.exists("{}".format(x)):
-            os.remove("{}".format(x))
-        if os.path.exists("{}.db".format(x)):
-            os.remove("{}.db".format(x))
-    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
-    data = get_data(CV_DIR_NAME, True)
-    cv_schema_json = {"id": {"type": "int32"},
-                      "file_name": {"type": "string"},
-                      "label": {"type": "int32"},
-                      "data": {"type": "bytes"}}
-    writer.add_schema(cv_schema_json, "img_schema")
-    writer.add_index(["file_name", "label"])
-    writer.write_raw_data(data)
-    writer.commit()
-    yield "yield_cv_data"
-    for x in paths:
-        os.remove("{}".format(x))
-        os.remove("{}.db".format(x))
+    try:
+        for x in paths:
+            if os.path.exists("{}".format(x)):
+                os.remove("{}".format(x))
+            if os.path.exists("{}.db".format(x)):
+                os.remove("{}.db".format(x))
+        writer = FileWriter(CV_FILE_NAME, FILES_NUM)
+        data = get_data(CV_DIR_NAME, True)
+        cv_schema_json = {"id": {"type": "int32"},
+                          "file_name": {"type": "string"},
+                          "label": {"type": "int32"},
+                          "data": {"type": "bytes"}}
+        writer.add_schema(cv_schema_json, "img_schema")
+        writer.add_index(["file_name", "label"])
+        writer.write_raw_data(data)
+        writer.commit()
+        yield "yield_cv_data"
+    except Exception as error:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))
+        raise error
+    else:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))


 def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file):
     """tutorial for cv minderdataset."""
@@ -626,3 +632,24 @@ def get_data(dir_name, sampler=False):
         except FileNotFoundError:
             continue
     return data_list
+
+
+if __name__ == '__main__':
+    test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file)
+    test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file)
+    test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file)
+    test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_basic(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_replica(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_empty(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_out_of_range(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_negative(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_basic(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_repeat(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_replacement(add_and_remove_cv_file)
+    test_cv_minddataset_sequential_sampler_basic(add_and_remove_cv_file)
+    test_cv_minddataset_sequential_sampler_exceed_size(add_and_remove_cv_file)
+    test_cv_minddataset_split_basic(add_and_remove_cv_file)
+    test_cv_minddataset_split_exact_percent(add_and_remove_cv_file)
+    test_cv_minddataset_split_fuzzy_percent(add_and_remove_cv_file)
+    test_cv_minddataset_split_deterministic(add_and_remove_cv_file)
+    test_cv_minddataset_split_sharding(add_and_remove_cv_file)