@@ -14,7 +14,7 @@
 # ============================================================================
 """generate dataloader and data processing entry"""
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 from src.utils import DistributedSampler
@@ -32,7 +32,7 @@ def GetDataLoader(per_batch_size,
 """
 centerface_gen = CenterfaceDataset(config=config, split=split)
 sampler = DistributedSampler(centerface_gen, rank, group_size, shuffle=(split == 'train')) # user defined sampling strategy
-de_dataset = de.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)
+de_dataset = ds.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)
 if group_size > 1:
 num_parallel_workers = 24
@@ -17,7 +17,7 @@ Data operations, will be used in train.py and eval.py
 """
 import os
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noise_gaussian, noise_salt_pepper, \
 shift_color, enhance_brightness, enhance_sharpness, enhance_contrast, enhance_color, gaussian_blur, \
@@ -26,6 +26,7 @@ from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noi
 import cv2
 import numpy as np
 cv2.setNumThreads(0)
 image_height = None
@@ -179,23 +180,24 @@ def create_dataset_train(mindrecord_file_pos, config):
 rank_id = int(os.getenv("RANK_ID", '0'))
 decode = C.Decode()
-ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
-num_shards=rank_size, shard_id=rank_id, shuffle=True)
-ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
+data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
+num_shards=rank_size, shard_id=rank_id, shuffle=True)
+data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
 augmentor = Augmentor(config.augment_severity, config.augment_prob)
 operation = augmentor.process
-ds = ds.map(operations=operation, input_columns=["image"],
-num_parallel_workers=1, python_multiprocessing=True)
+data_set = data_set.map(operations=operation, input_columns=["image"],
+num_parallel_workers=1, python_multiprocessing=True)
 ##randomly augment half of samples to be negative samples
-ds = ds.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], input_columns=["image", "label"],
-num_parallel_workers=8, python_multiprocessing=True)
-##for training double the dataset to accoun for positive and negative
-ds = ds.repeat(2)
+data_set = data_set.map(operations=[random_neg_with_rotate, unify_img_label, transform_image],
+input_columns=["image", "label"],
+num_parallel_workers=8, python_multiprocessing=True)
+##for training double the data_set to accoun for positive and negative
+data_set = data_set.repeat(2)
 # apply batch operations
-ds = ds.batch(config.batch_size, drop_remainder=True)
-return ds
+data_set = data_set.batch(config.batch_size, drop_remainder=True)
+return data_set
 def resize_image(img, label):
@@ -230,17 +232,18 @@ def create_dataset_eval(mindrecord_file_pos, config):
 rank_id = int(os.getenv("RANK_ID", '0'))
 decode = C.Decode()
-ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
-num_shards=rank_size, shard_id=rank_id, shuffle=False)
-ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
+data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
+num_shards=rank_size, shard_id=rank_id, shuffle=False)
+data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
 global image_height
 global image_width
 image_height = config.im_size_h
 image_width = config.im_size_w
-ds = ds.map(operations=resize_image, input_columns=["image", "label"], num_parallel_workers=config.work_nums,
-python_multiprocessing=False)
+data_set = data_set.map(operations=resize_image, input_columns=["image", "label"],
+num_parallel_workers=config.work_nums,
+python_multiprocessing=False)
 # apply batch operations
-ds = ds.batch(1, drop_remainder=True)
+data_set = data_set.batch(1, drop_remainder=True)
-return ds
+return data_set
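Most hunks in this change apply the same two-part pattern shown above: the engine alias import (import mindspore.dataset.engine as de) is replaced by the public package import (import mindspore.dataset as ds), and the local pipeline variable is renamed from ds to data_set so it no longer shadows the new module alias; the remaining hunks are lint-style fixes (operator spacing, blank lines between definitions). A minimal sketch of the resulting style, where the source argument is a hypothetical stand-in and not part of this change:

    import mindspore.dataset as ds

    def create_dataset(source, batch_size=32):
        # keep the pipeline in data_set so the ds module alias stays usable
        data_set = ds.GeneratorDataset(source, ["image", "label"], shuffle=True)
        data_set = data_set.batch(batch_size, drop_remainder=True)
        return data_set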
@@ -16,7 +16,7 @@
 import os
 import numpy as np
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C
 import mindspore.dataset.vision.c_transforms as vc
 from PIL import Image, ImageFile
@@ -105,7 +105,7 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
 dataset = IIIT5KDataset(dataset_path, "annotation.txt", config)
 else:
 raise ValueError(f"unsupported dataset name: {name}")
-ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
+data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
 image_trans = [
 vc.Resize((config.image_height, config.image_width)),
 vc.Normalize([127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
@@ -114,8 +114,8 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
 label_trans = [
 C.TypeCast(mstype.int32)
 ]
-ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
-ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
+data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
+data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
-ds = ds.batch(batch_size, drop_remainder=True)
-return ds
+data_set = data_set.batch(batch_size, drop_remainder=True)
+return data_set
@@ -16,7 +16,7 @@
 Data operations, will be used in train.py and eval.py
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C
 from src.config import config_gpu as cfg
@@ -37,33 +37,33 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
 dataset
 """
 if group_size == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
-num_shards=group_size, shard_id=rank)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
+num_shards=group_size, shard_id=rank)
 # define map operations
 if do_train:
 trans = [
 C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
 C.RandomHorizontalFlip(prob=0.5),
 C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
-]
+]
 else:
 trans = [
 C.Decode(),
 C.Resize(299),
 C.CenterCrop(299)
-]
+]
 trans += [
 C.Rescale(1.0 / 255.0, 0.0),
 C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
 C.HWC2CHW()
 ]
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
 # apply batch operations
-ds = ds.batch(cfg.batch_size, drop_remainder=True)
+data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
-return ds
+data_set = data_set.repeat(repeat_num)
+return data_set
@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -44,10 +44,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 device_num = get_group_size()
 if device_num == 1:
-ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=device_num, shard_id=rank_id)
+data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=device_num, shard_id=rank_id)
 # define map operations
 trans = []
@@ -66,15 +66,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
@@ -99,10 +99,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 device_num = get_group_size()
 if device_num == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=device_num, shard_id=rank_id)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=device_num, shard_id=rank_id)
 image_size = 224
 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -127,16 +127,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def _get_rank_info():
@@ -21,7 +21,7 @@ import numpy as np
 from mindspore import Tensor
 from mindspore.train.model import Model
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
@@ -43,22 +43,22 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):
 rank_size = int(os.getenv("RANK_SIZE", '1'))
 rank_id = int(os.getenv("RANK_ID", '0'))
 if rank_size == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=rank_size, shard_id=rank_id)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=rank_size, shard_id=rank_id)
 elif config.platform == "GPU":
 if do_train:
 if config.run_distribute:
 from mindspore.communication.management import get_rank, get_group_size
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=get_group_size(), shard_id=get_rank())
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=get_group_size(), shard_id=get_rank())
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 elif config.platform == "CPU":
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 resize_height = config.image_height
 resize_width = config.image_width
@@ -83,19 +83,19 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 # apply shuffle operations
-ds = ds.shuffle(buffer_size=buffer_size)
+data_set = data_set.shuffle(buffer_size=buffer_size)
 # apply batch operations
-ds = ds.batch(config.batch_size, drop_remainder=True)
+data_set = data_set.batch(config.batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def extract_features(net, dataset_path, config):
@@ -121,5 +121,5 @@ def extract_features(net, dataset_path, config):
 features = model.predict(Tensor(image))
 np.save(features_path, features.asnumpy())
 np.save(label_path, label)
-print(f"Complete the batch {i+1}/{step_size}")
+print(f"Complete the batch {i + 1}/{step_size}")
 return step_size
@@ -18,7 +18,7 @@ create train or eval dataset.
 import os
 from functools import partial
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.transforms.py_transforms as P2
@@ -43,24 +43,24 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
 rank_id = int(os.getenv("RANK_ID"))
 columns_list = ['image', 'label']
 if config.data_load_mode == "mindrecord":
-load_func = partial(de.MindDataset, dataset_path, columns_list)
+load_func = partial(ds.MindDataset, dataset_path, columns_list)
 else:
-load_func = partial(de.ImageFolderDataset, dataset_path)
+load_func = partial(ds.ImageFolderDataset, dataset_path)
 if do_train:
 if rank_size == 1:
-ds = load_func(num_parallel_workers=8, shuffle=True)
+data_set = load_func(num_parallel_workers=8, shuffle=True)
 else:
-ds = load_func(num_parallel_workers=8, shuffle=True,
-num_shards=rank_size, shard_id=rank_id)
+data_set = load_func(num_parallel_workers=8, shuffle=True,
+num_shards=rank_size, shard_id=rank_id)
 else:
-ds = load_func(num_parallel_workers=8, shuffle=False)
+data_set = load_func(num_parallel_workers=8, shuffle=False)
 elif device_target == "GPU":
 if do_train:
 from mindspore.communication.management import get_rank, get_group_size
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=get_group_size(), shard_id=get_rank())
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=get_group_size(), shard_id=get_rank())
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
 raise ValueError("Unsupported device_target.")
@@ -69,7 +69,7 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
 if do_train:
 buffer_size = 20480
 # apply shuffle operations
-ds = ds.shuffle(buffer_size=buffer_size)
+data_set = data_set.shuffle(buffer_size=buffer_size)
 # define map operations
 decode_op = C.Decode()
@@ -89,16 +89,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=16)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=16)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=1, batch_size=32):
@@ -119,12 +119,12 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
 rank_id = int(os.getenv("RANK_ID"))
 if do_train:
 if rank_size == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=rank_size, shard_id=rank_id)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=rank_size, shard_id=rank_id)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
 else:
 raise ValueError("Unsupported device target.")
@@ -133,7 +133,7 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
 if do_train:
 buffer_size = 20480
 # apply shuffle operations
-ds = ds.shuffle(buffer_size=buffer_size)
+data_set = data_set.shuffle(buffer_size=buffer_size)
 # define map operations
 decode_op = P.Decode()
@@ -152,12 +152,13 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
 compose = P2.Compose(trans)
-ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True)
+data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8,
+python_multiprocessing=True)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
@@ -16,7 +16,7 @@
 create train or eval dataset.
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
@@ -38,12 +38,12 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
 if do_train:
 if run_distribute:
 from mindspore.communication.management import get_rank, get_group_size
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=get_group_size(), shard_id=get_rank())
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=get_group_size(), shard_id=get_rank())
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
 raise ValueError("Unsupported device_target.")
@@ -70,16 +70,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 # apply shuffle operations
-ds = ds.shuffle(buffer_size=buffer_size)
+data_set = data_set.shuffle(buffer_size=buffer_size)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
@@ -16,7 +16,7 @@
 Data operations, will be used in train.py and eval.py
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C
@@ -37,10 +37,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
 rank = config.rank
 group_size = config.group_size
 if group_size == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
-num_shards=group_size, shard_id=rank)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
+num_shards=group_size, shard_id=rank)
 # define map operations
 if do_train:
 trans = [
@@ -60,10 +60,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
 C.HWC2CHW()
 ]
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
 # apply batch operations
-ds = ds.batch(config.batch_size, drop_remainder=True)
+data_set = data_set.batch(config.batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
-return ds
+data_set = data_set.repeat(repeat_num)
+return data_set
@@ -25,21 +25,24 @@ import pyclipper
 from PIL import Image
 from src.config import config
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.py_transforms as py_transforms
 __all__ = ['train_dataset_creator', 'test_dataset_creator']
+
 def get_img(img_path):
 img = cv2.imread(img_path)
 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 return img
+
 def get_imgs_names(root_dir):
 img_paths = [i for i in os.listdir(root_dir)
 if os.path.splitext(i)[-1].lower() in ['.jpg', '.jpeg', '.png']]
 return img_paths
+
 def get_bboxes(img, gt_path):
 h, w = img.shape[0:2]
 with open(gt_path, 'r', encoding='utf-8-sig') as f:
@@ -58,6 +61,7 @@ def get_bboxes(img, gt_path):
 tags.append(tag)
 return np.array(bboxes), tags
+
 def random_scale(img, min_size):
 h, w = img.shape[0:2]
 if max(h, w) > 1280:
@@ -74,12 +78,14 @@ def random_scale(img, min_size):
 img = cv2.resize(img, dsize=None, fx=scale2, fy=scale2)
 return img
+
 def random_horizontal_flip(imgs):
 if random.random() < 0.5:
 for i, _ in enumerate(imgs):
 imgs[i] = np.flip(imgs[i], axis=1).copy()
 return imgs
+
 def random_rotate(imgs):
 max_angle = 10
 angle = random.random() * 2 * max_angle - max_angle
@@ -91,6 +97,7 @@ def random_rotate(imgs):
 imgs[i] = img_rotation
 return imgs
+
 def random_crop(imgs, img_size):
 h, w = imgs[0].shape[0:2]
 th, tw = img_size
@@ -118,21 +125,25 @@ def random_crop(imgs, img_size):
 imgs[idx] = imgs[idx][i:i + th, j:j + tw]
 return imgs
+
 def scale(img, long_size=2240):
 h, w = img.shape[0:2]
 scale_long = long_size * 1.0 / max(h, w)
 img = cv2.resize(img, dsize=None, fx=scale_long, fy=scale_long)
 return img
+
 def dist(a, b):
 return np.sqrt(np.sum((a - b) ** 2))
+
 def perimeter(bbox):
 peri = 0.0
 for i in range(bbox.shape[0]):
 peri += dist(bbox[i], bbox[(i + 1) % bbox.shape[0]])
 return peri
+
 def shrink(bboxes, rate, max_shr=20):
 rate = rate * rate
 shrinked_bboxes = []
@@ -158,6 +169,7 @@ def shrink(bboxes, rate, max_shr=20):
 return np.array(shrinked_bboxes)
+
 class TrainDataset:
 def __init__(self):
 self.is_transform = True
@@ -260,6 +272,7 @@ class TrainDataset:
 def __len__(self):
 return len(self.all_img_paths)
+
 def IC15_TEST_Generator():
 ic15_test_data_dir = config.TEST_ROOT_DIR + 'ch4_test_images/'
 img_size = config.INFER_LONG_SIZE
@@ -298,6 +311,7 @@ def IC15_TEST_Generator():
 yield img, img_resized, img_name
+
 class DistributedSampler():
 def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
 self.dataset = dataset
@@ -324,18 +338,20 @@ class DistributedSampler():
 def __len__(self):
 return self.num_samplers
+
 def train_dataset_creator(rank, group_size, shuffle=True):
 cv2.setNumThreads(0)
 dataset = TrainDataset()
 sampler = DistributedSampler(dataset, rank, group_size, shuffle)
-ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
-sampler=sampler)
-ds = ds.repeat(1)
-ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
-return ds
+data_set = ds.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
+sampler=sampler)
+data_set = data_set.repeat(1)
+data_set = data_set.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
+return data_set
+
 def test_dataset_creator():
-ds = de.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
-ds = ds.shuffle(config.TEST_BUFFER_SIZE)
-ds = ds.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
-return ds
+data_set = ds.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
+data_set = data_set.shuffle(config.TEST_BUFFER_SIZE)
+data_set = data_set.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
+return data_set
@@ -29,7 +29,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 from src.resnet_gpu_benchmark import resnet50 as resnet
 from src.CrossEntropySmooth import CrossEntropySmooth
@@ -45,19 +45,22 @@ parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dat
 parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\
 Or the ckpt model file when eval is True')
 parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode')
-parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\
-help='Compute data type fp32 or fp16: default fp16')
+parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \
+help='Compute data type fp32 or fp16: default fp16')
 args_opt = parser.parse_args()
 set_seed(1)
+
 class MyTimeMonitor(Callback):
 def __init__(self, batch_size, sink_size):
 super(MyTimeMonitor, self).__init__()
 self.batch_size = batch_size
 self.size = sink_size
+
 def step_begin(self, run_context):
 self.step_time = time.time()
+
 def step_end(self, run_context):
 cb_params = run_context.original_args()
 loss = cb_params.net_outputs
@@ -75,17 +78,18 @@ class MyTimeMonitor(Callback):
 raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
 cb_params.cur_epoch_num, cur_step_in_epoch))
 step_mseconds = (time.time() - self.step_time) * 1000
-fps = self.batch_size / step_mseconds *1000 * self.size
+fps = self.batch_size / step_mseconds * 1000 * self.size
 print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss),
 "Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True)
+
 def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16",
 device_num=1):
 if device_num == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
-num_shards=device_num, shard_id=get_rank())
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
+num_shards=device_num, shard_id=get_rank())
 image_size = 224
 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
 std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
@@ -113,14 +117,15 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
 ]
 if dtype == "fp32":
 trans.append(C.HWC2CHW())
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
 if repeat_num > 1:
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
+
 def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
 lr_each_step = []
@@ -136,6 +141,7 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per
 lr_each_step = np.array(lr_each_step).astype(np.float32)
 return lr_each_step
+
 def train():
 # set args
 dev = "GPU"
@@ -221,6 +227,7 @@ def train():
 else:
 model.train(epoch_size, dataset, callbacks=cb)
+
 def eval_():
 # set args
 dev = "GPU"
@@ -251,6 +258,7 @@ def eval_():
 res = model.eval(dataset)
 print("result:", res, "ckpt=", ckpt_dir)
+
 if __name__ == '__main__':
 if not args_opt.eval:
 train()
@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -47,10 +47,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 else:
 device_num = 1
 if device_num == 1:
-ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=device_num, shard_id=rank_id)
+data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=device_num, shard_id=rank_id)
 # define map operations
 trans = []
@@ -69,15 +69,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -106,10 +106,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 device_num = 1
 if device_num == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=device_num, shard_id=rank_id)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=device_num, shard_id=rank_id)
 image_size = 224
 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -134,16 +134,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -171,10 +171,10 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 device_num = 1
 rank_id = 1
 if device_num == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=device_num, shard_id=rank_id)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+num_shards=device_num, shard_id=rank_id)
 image_size = 224
 mean = [0.475 * 255, 0.451 * 255, 0.392 * 255]
 std = [0.275 * 255, 0.267 * 255, 0.278 * 255]
@@ -198,15 +198,15 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -234,10 +234,10 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 else:
 device_num = 1
 if device_num == 1:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
 else:
-ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
-num_shards=device_num, shard_id=rank_id)
+data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
+num_shards=device_num, shard_id=rank_id)
 image_size = 224
 mean = [123.68, 116.78, 103.94]
 std = [1.0, 1.0, 1.0]
@@ -260,16 +260,16 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 ]
 type_cast_op = C2.TypeCast(mstype.int32)
-ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=12)
-ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
+data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12)
+data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
 # apply batch operations
-ds = ds.batch(batch_size, drop_remainder=True)
+data_set = data_set.batch(batch_size, drop_remainder=True)
 # apply dataset repeat operation
-ds = ds.repeat(repeat_num)
+data_set = data_set.repeat(repeat_num)
-return ds
+return data_set
 def _get_rank_info():
| @@ -18,7 +18,7 @@ create train or eval dataset. | |||||
| import os | import os | ||||
| from functools import partial | from functools import partial | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.py_transforms as P2 | import mindspore.dataset.transforms.py_transforms as P2 | ||||
| @@ -53,14 +53,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||||
| columns_list = ['image', 'label'] | columns_list = ['image', 'label'] | ||||
| if config.data_load_mode == "mindrecord": | if config.data_load_mode == "mindrecord": | ||||
| load_func = partial(de.MindDataset, dataset_path, columns_list) | |||||
| load_func = partial(ds.MindDataset, dataset_path, columns_list) | |||||
| else: | else: | ||||
| load_func = partial(de.ImageFolderDataset, dataset_path) | |||||
| load_func = partial(ds.ImageFolderDataset, dataset_path) | |||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = load_func(num_parallel_workers=8, shuffle=True) | |||||
| data_set = load_func(num_parallel_workers=8, shuffle=True) | |||||
| else: | else: | ||||
| ds = load_func(num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| data_set = load_func(num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| image_size = 224 | image_size = 224 | ||||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | ||||
| @@ -85,16 +85,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
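The `functools.partial` dispatch above keeps the sharding arguments in one place regardless of whether MindRecord files or raw image folders are read. A hypothetical sketch of the same shape: the source here is a small random-access class standing in for MindDataset/ImageFolderDataset, so the snippet runs without data files (MindSpore assumed installed).

    from functools import partial
    import numpy as np
    import mindspore.dataset as ds

    class SyntheticSource:
        """Random-access stand-in for MindDataset / ImageFolderDataset rows."""
        def __init__(self, length=16):
            self.length = length
        def __getitem__(self, index):
            return (np.random.rand(8).astype(np.float32),
                    np.array(index % 4, dtype=np.int32))
        def __len__(self):
            return self.length

    def build_loader(device_num=1, rank_id=0):
        # bind everything except the sharding arguments, mirroring the load_func pattern
        load_func = partial(ds.GeneratorDataset, SyntheticSource(), ["image", "label"])
        if device_num == 1:
            data_set = load_func(shuffle=True)
        else:
            data_set = load_func(shuffle=True, num_shards=device_num, shard_id=rank_id)
        return data_set

    print("rows on this shard:", build_loader(device_num=2, rank_id=0).get_dataset_size())  # 8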
| def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): | def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): | ||||
| @@ -121,12 +121,12 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe | |||||
| if do_train: | if do_train: | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) | |||||
| image_size = 224 | image_size = 224 | ||||
| @@ -147,12 +147,13 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe | |||||
| trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op] | trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op] | ||||
| compose = P2.Compose(trans) | compose = P2.Compose(trans) | ||||
| ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True) | |||||
| data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8, | |||||
| python_multiprocessing=True) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| from mindspore.communication.management import init, get_rank, get_group_size | from mindspore.communication.management import init, get_rank, get_group_size | ||||
| @@ -47,10 +47,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||||
| num_parallels = 4 | num_parallels = 4 | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| image_size = 224 | image_size = 224 | ||||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | ||||
| @@ -75,16 +75,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels) | |||||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
| def _get_rank_info(): | def _get_rank_info(): | ||||
| @@ -15,7 +15,7 @@ | |||||
| """Data operations, will be used in train.py and eval.py""" | """Data operations, will be used in train.py and eval.py""" | ||||
| from src.config import config | from src.config import config | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| @@ -36,10 +36,10 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0): | |||||
| """ | """ | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank) | |||||
| # define map operations | # define map operations | ||||
| if do_train: | if do_train: | ||||
| trans = [ | trans = [ | ||||
| @@ -59,8 +59,8 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0): | |||||
| ] | ] | ||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(config.batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(config.batch_size, drop_remainder=True) | |||||
| return data_set | |||||
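As in the other `create_dataset` variants in this patch, only the transform list differs between training and evaluation; the pipeline shape is identical. A self-contained sketch of that branch, feeding random already-decoded HWC uint8 images so no Decode step or image files are needed (transform values are illustrative, not the repository's exact ones).

    import numpy as np
    import mindspore.common.dtype as mstype
    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as C2
    import mindspore.dataset.vision.c_transforms as C

    def build(do_train, batch_size=4):
        # random HWC uint8 images replace ImageFolderDataset + Decode
        images = np.random.randint(0, 255, size=(16, 32, 32, 3), dtype=np.uint8)
        labels = np.random.randint(0, 10, size=(16,)).astype(np.int32)
        data_set = ds.NumpySlicesDataset({"image": images, "label": labels}, shuffle=do_train)

        if do_train:
            trans = [C.RandomHorizontalFlip(prob=0.5)]
        else:
            trans = []
        trans += [C.Resize((224, 224)),
                  C.Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                              std=[0.229 * 255, 0.224 * 255, 0.225 * 255]),
                  C.HWC2CHW()]
        type_cast_op = C2.TypeCast(mstype.int32)

        data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=2)
        data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=2)
        return data_set.batch(batch_size, drop_remainder=True)

    print("train batches:", build(do_train=True).get_dataset_size())  # 4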
| @@ -19,7 +19,7 @@ import numpy as np | |||||
| from src.config import config_gpu as cfg | from src.config import config_gpu as cfg | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| @@ -46,10 +46,10 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): | |||||
| dataset | dataset | ||||
| """ | """ | ||||
| if group_size == 1: | if group_size == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, | |||||
| num_shards=group_size, shard_id=rank) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, | |||||
| num_shards=group_size, shard_id=rank) | |||||
| # define map operations | # define map operations | ||||
| if do_train: | if do_train: | ||||
| trans = [ | trans = [ | ||||
| @@ -71,9 +71,9 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): | |||||
| ] | ] | ||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) | |||||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(cfg.batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(cfg.batch_size, drop_remainder=True) | |||||
| return ds | |||||
| return data_set | |||||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| from mindspore.communication.management import init, get_rank, get_group_size | from mindspore.communication.management import init, get_rank, get_group_size | ||||
| @@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path, | |||||
| device_num = get_group_size() | device_num = get_group_size() | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| data_set = ds.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| else: | else: | ||||
| ds = de.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| data_set = ds.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| # define map operations | # define map operations | ||||
| if do_train: | if do_train: | ||||
| @@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path, | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
| def create_dataset_imagenet(dataset_path, | def create_dataset_imagenet(dataset_path, | ||||
| @@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path, | |||||
| device_num = get_group_size() | device_num = get_group_size() | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| image_size = 227 | image_size = 227 | ||||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | ||||
| @@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path, | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
| def _get_rank_info(): | def _get_rank_info(): | ||||
| @@ -17,7 +17,7 @@ import os | |||||
| import math as m | import math as m | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as c | import mindspore.dataset.transforms.c_transforms as c | ||||
| import mindspore.dataset.vision.c_transforms as vc | import mindspore.dataset.vision.c_transforms as vc | ||||
| from PIL import Image | from PIL import Image | ||||
| @@ -86,7 +86,7 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_ | |||||
| """ | """ | ||||
| dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target) | dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target) | ||||
| ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) | |||||
| data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) | |||||
| image_trans = [ | image_trans = [ | ||||
| vc.Rescale(1.0 / 255.0, 0.0), | vc.Rescale(1.0 / 255.0, 0.0), | ||||
| vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]), | vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]), | ||||
| @@ -96,12 +96,12 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_ | |||||
| label_trans = [ | label_trans = [ | ||||
| c.TypeCast(mstype.int32) | c.TypeCast(mstype.int32) | ||||
| ] | ] | ||||
| ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) | |||||
| if device_target == 'Ascend': | if device_target == 'Ascend': | ||||
| ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8) | |||||
| else: | else: | ||||
| ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8) | |||||
| ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) | |||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
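The hunk above wraps a user-defined `_CaptchaDataset` in `GeneratorDataset` and applies a different transpose depending on the device target. A sketch of the same shape with a synthetic source; the class, image shape, and transpose helpers here are illustrative stand-ins, not the repository's.

    import numpy as np
    import mindspore.common.dtype as mstype
    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c

    class _SyntheticCaptcha:
        """Random-access stand-in for _CaptchaDataset: returns (HWC image, label)."""
        def __init__(self, length=8):
            self.length = length
        def __getitem__(self, index):
            image = np.random.rand(64, 112, 3).astype(np.float32)
            label = np.array([index % 10, (index + 1) % 10], dtype=np.int64)
            return image, label
        def __len__(self):
            return self.length

    def transpose_hwc2chw(image):
        return image.transpose(2, 0, 1).copy()

    def transpose_hwc2whc(image):
        return image.transpose(1, 0, 2).copy()

    def create_dataset(device_target="GPU", batch_size=2):
        data_set = ds.GeneratorDataset(_SyntheticCaptcha(), ["image", "label"], shuffle=True)
        if device_target == "Ascend":
            data_set = data_set.map(operations=transpose_hwc2whc, input_columns=["image"])
        else:
            data_set = data_set.map(operations=transpose_hwc2chw, input_columns=["image"])
        data_set = data_set.map(operations=c.TypeCast(mstype.int32), input_columns=["label"])
        return data_set.batch(batch_size, drop_remainder=True)

    print(create_dataset().get_dataset_size())  # 4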
| @@ -16,10 +16,11 @@ | |||||
| Data operations, will be used in train.py and eval.py | Data operations, will be used in train.py and eval.py | ||||
| """ | """ | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): | def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): | ||||
| """ | """ | ||||
| create a train or eval dataset | create a train or eval dataset | ||||
| @@ -35,10 +36,10 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): | |||||
| dataset | dataset | ||||
| """ | """ | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank) | |||||
| # define map operations | # define map operations | ||||
| if do_train: | if do_train: | ||||
| trans = [ | trans = [ | ||||
| @@ -59,8 +60,8 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): | |||||
| ] | ] | ||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| from .config import cfg | from .config import cfg | ||||
| @@ -31,65 +31,67 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, | |||||
| for file_name in files: | for file_name in files: | ||||
| if "tfrecord" in file_name: | if "tfrecord" in file_name: | ||||
| data_files.append(os.path.join(data_dir, file_name)) | data_files.append(os.path.join(data_dir, file_name)) | ||||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, | |||||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, | |||||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print('origin dataset size: ', ori_dataset_size) | print('origin dataset size: ', ori_dataset_size) | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(cfg.batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||||
| return ds | |||||
| data_set = data_set.batch(cfg.batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||||
| return data_set | |||||
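Besides the local rename, `de.Shuffle.FILES` becomes `ds.Shuffle.FILES`: the `Shuffle` enum is exposed directly from `mindspore.dataset`. Since no TFRecord files are assumed here, the sketch below reproduces only the cast-every-column chain over a synthetic generator whose column set mirrors the BERT pretraining records.

    import numpy as np
    import mindspore.common.dtype as mstype
    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as C

    columns = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
               "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"]

    def synthetic_records():
        # stands in for TFRecordDataset; with real files, file-level shuffling
        # would be requested via shuffle=ds.Shuffle.FILES
        for _ in range(8):
            yield tuple(np.zeros(16, dtype=np.int64) for _ in columns)

    data_set = ds.GeneratorDataset(synthetic_records, column_names=columns, shuffle=False)

    type_cast_op = C.TypeCast(mstype.int32)
    for name in ["masked_lm_ids", "masked_lm_positions", "next_sentence_labels",
                 "segment_ids", "input_mask", "input_ids"]:
        data_set = data_set.map(operations=type_cast_op, input_columns=name)

    data_set = data_set.batch(4, drop_remainder=True)
    print("data size:", data_set.get_dataset_size())  # 2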
| def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | ||||
| data_file_path=None, schema_file_path=None, do_shuffle=True): | data_file_path=None, schema_file_path=None, do_shuffle=True): | ||||
| """create finetune or evaluation dataset""" | """create finetune or evaluation dataset""" | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) | |||||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], | |||||
| shuffle=do_shuffle) | |||||
| if assessment_method == "Spearman_correlation": | if assessment_method == "Spearman_correlation": | ||||
| type_cast_op_float = C.TypeCast(mstype.float32) | type_cast_op_float = C.TypeCast(mstype.float32) | ||||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| else: | else: | ||||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | ||||
| data_file_path=None, schema_file_path=None, do_shuffle=True): | data_file_path=None, schema_file_path=None, do_shuffle=True): | ||||
| """create finetune or evaluation dataset""" | """create finetune or evaluation dataset""" | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) | |||||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], | |||||
| shuffle=do_shuffle) | |||||
| if assessment_method == "Spearman_correlation": | if assessment_method == "Spearman_correlation": | ||||
| type_cast_op_float = C.TypeCast(mstype.float32) | type_cast_op_float = C.TypeCast(mstype.float32) | ||||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| else: | else: | ||||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| def generator_squad(data_features): | def generator_squad(data_features): | ||||
| @@ -102,20 +104,20 @@ def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, sche | |||||
| """create finetune or evaluation dataset""" | """create finetune or evaluation dataset""" | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| if is_training: | if is_training: | ||||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", | |||||
| "end_positions", "unique_ids", "is_impossible"], | |||||
| shuffle=do_shuffle) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="start_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="end_positions") | |||||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", | |||||
| "end_positions", "unique_ids", "is_impossible"], | |||||
| shuffle=do_shuffle) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="end_positions") | |||||
| else: | else: | ||||
| ds = de.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, | |||||
| column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="unique_ids") | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, | |||||
| column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="unique_ids") | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| from .bert_net_config import bert_net_cfg | from .bert_net_config import bert_net_cfg | ||||
| @@ -32,96 +32,96 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, | |||||
| if "tfrecord" in file_name: | if "tfrecord" in file_name: | ||||
| data_files.append(os.path.join(data_dir, file_name)) | data_files.append(os.path.join(data_dir, file_name)) | ||||
| data_files = sorted(data_files) | data_files = sorted(data_files) | ||||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, | |||||
| num_shards=device_num, shard_id=rank, shard_equal_rows=False) | |||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, | |||||
| num_shards=device_num, shard_id=rank, shard_equal_rows=False) | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print('origin dataset size: ', ori_dataset_size) | print('origin dataset size: ', ori_dataset_size) | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||||
| return ds | |||||
| data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||||
| return data_set | |||||
| def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | ||||
| data_file_path=None, schema_file_path=None): | data_file_path=None, schema_file_path=None): | ||||
| """create finetune or evaluation dataset""" | """create finetune or evaluation dataset""" | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||||
| if assessment_method == "Spearman_correlation": | if assessment_method == "Spearman_correlation": | ||||
| type_cast_op_float = C.TypeCast(mstype.float32) | type_cast_op_float = C.TypeCast(mstype.float32) | ||||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| else: | else: | ||||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| # apply shuffle operation | # apply shuffle operation | ||||
| buffer_size = 960 | buffer_size = 960 | ||||
| ds = ds.shuffle(buffer_size=buffer_size) | |||||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | ||||
| data_file_path=None, schema_file_path=None): | data_file_path=None, schema_file_path=None): | ||||
| """create finetune or evaluation dataset""" | """create finetune or evaluation dataset""" | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||||
| if assessment_method == "Spearman_correlation": | if assessment_method == "Spearman_correlation": | ||||
| type_cast_op_float = C.TypeCast(mstype.float32) | type_cast_op_float = C.TypeCast(mstype.float32) | ||||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||||
| else: | else: | ||||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| # apply shuffle operation | # apply shuffle operation | ||||
| buffer_size = 960 | buffer_size = 960 | ||||
| ds = ds.shuffle(buffer_size=buffer_size) | |||||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True): | def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True): | ||||
| """create finetune or evaluation dataset""" | """create finetune or evaluation dataset""" | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| if is_training: | if is_training: | ||||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", | |||||
| "start_positions", "end_positions", | |||||
| "unique_ids", "is_impossible"]) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="start_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="end_positions") | |||||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", | |||||
| "start_positions", "end_positions", | |||||
| "unique_ids", "is_impossible"]) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="end_positions") | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| # apply shuffle operation | # apply shuffle operation | ||||
| buffer_size = 960 | buffer_size = 960 | ||||
| ds = ds.shuffle(buffer_size=buffer_size) | |||||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| @@ -22,7 +22,7 @@ import mindspore.ops.operations as P | |||||
| from mindspore.common.tensor import Tensor | from mindspore.common.tensor import Tensor | ||||
| from mindspore.train.model import Model | from mindspore.train.model import Model | ||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | from mindspore.train.serialization import load_checkpoint, load_param_into_net | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as deC | import mindspore.dataset.transforms.c_transforms as deC | ||||
| from mindspore import context | from mindspore import context | ||||
| from src.fasttext_model import FastText | from src.fasttext_model import FastText | ||||
| @@ -73,15 +73,15 @@ class FastTextInferCell(nn.Cell): | |||||
| def load_infer_dataset(batch_size, datafile): | def load_infer_dataset(batch_size, datafile): | ||||
| """data loader for infer""" | """data loader for infer""" | ||||
| ds = de.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx']) | |||||
| data_set = ds.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx']) | |||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="label_idx") | |||||
| ds = ds.batch(batch_size=batch_size, drop_remainder=True) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_idx") | |||||
| data_set = data_set.batch(batch_size=batch_size, drop_remainder=True) | |||||
| return ds | |||||
| return data_set | |||||
| def run_fasttext_infer(): | def run_fasttext_infer(): | ||||
| """run infer with FastText""" | """run infer with FastText""" | ||||
| @@ -25,8 +25,10 @@ import spacy | |||||
| from sklearn.feature_extraction import FeatureHasher | from sklearn.feature_extraction import FeatureHasher | ||||
| from mindspore.mindrecord import FileWriter | from mindspore.mindrecord import FileWriter | ||||
| class FastTextDataPreProcess(): | class FastTextDataPreProcess(): | ||||
| """FastText data preprocess""" | """FastText data preprocess""" | ||||
| def __init__(self, train_path, | def __init__(self, train_path, | ||||
| test_file, | test_file, | ||||
| max_length, | max_length, | ||||
| @@ -194,7 +196,6 @@ class FastTextDataPreProcess(): | |||||
| if self.text_less in sent_describe and self.text_greater in sent_describe: | if self.text_less in sent_describe and self.text_greater in sent_describe: | ||||
| sent_describe = self.str_html.sub('', sent_describe) | sent_describe = self.str_html.sub('', sent_describe) | ||||
| doc = spacy_nlp(sent_describe) | doc = spacy_nlp(sent_describe) | ||||
| bows_token = [token.text for token in doc] | bows_token = [token.text for token in doc] | ||||
| @@ -222,7 +223,7 @@ class FastTextDataPreProcess(): | |||||
| def _get_bucket_length(self, x, bts): | def _get_bucket_length(self, x, bts): | ||||
| x_len = len(x) | x_len = len(x) | ||||
| for index in range(1, len(bts)): | for index in range(1, len(bts)): | ||||
| if bts[index-1] < x_len <= bts[index]: | |||||
| if bts[index - 1] < x_len <= bts[index]: | |||||
| return bts[index] | return bts[index] | ||||
| return bts[0] | return bts[0] | ||||
| @@ -310,7 +311,6 @@ if __name__ == '__main__': | |||||
| print("Writing test data to MindRecord file.....") | print("Writing test data to MindRecord file.....") | ||||
| for k in args.test_bucket: | for k in args.test_bucket: | ||||
| write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1) | write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1) | ||||
| print("All done.....") | print("All done.....") | ||||
| @@ -14,9 +14,10 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """FastText data loader""" | """FastText data loader""" | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as deC | import mindspore.dataset.transforms.c_transforms as deC | ||||
| def load_dataset(dataset_path, | def load_dataset(dataset_path, | ||||
| batch_size, | batch_size, | ||||
| epoch_count=1, | epoch_count=1, | ||||
| @@ -25,38 +26,40 @@ def load_dataset(dataset_path, | |||||
| bucket=None, | bucket=None, | ||||
| shuffle=True): | shuffle=True): | ||||
| """dataset loader""" | """dataset loader""" | ||||
| def batch_per_bucket(bucket_length, input_file): | def batch_per_bucket(bucket_length, input_file): | ||||
| input_file = input_file +'/train_dataset_bs_' + str(bucket_length) + '.mindrecord' | |||||
| input_file = input_file + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord' | |||||
| if not input_file: | if not input_file: | ||||
| raise FileNotFoundError("input file parameter must not be empty.") | raise FileNotFoundError("input file parameter must not be empty.") | ||||
| ds = de.MindDataset(input_file, | |||||
| columns_list=['src_tokens', 'src_tokens_length', 'label_idx'], | |||||
| shuffle=shuffle, | |||||
| num_shards=rank_size, | |||||
| shard_id=rank_id, | |||||
| num_parallel_workers=8) | |||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| data_set = ds.MindDataset(input_file, | |||||
| columns_list=['src_tokens', 'src_tokens_length', 'label_idx'], | |||||
| shuffle=shuffle, | |||||
| num_shards=rank_size, | |||||
| shard_id=rank_id, | |||||
| num_parallel_workers=8) | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print(f"Dataset size: {ori_dataset_size}") | print(f"Dataset size: {ori_dataset_size}") | ||||
| repeat_count = epoch_count | repeat_count = epoch_count | ||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="label_idx") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_idx") | |||||
| data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'], | |||||
| output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag']) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=False) | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| return data_set | |||||
| ds = ds.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'], | |||||
| output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag']) | |||||
| ds = ds.batch(batch_size, drop_remainder=False) | |||||
| ds = ds.repeat(repeat_count) | |||||
| return ds | |||||
| for i, _ in enumerate(bucket): | for i, _ in enumerate(bucket): | ||||
| bucket_len = bucket[i] | bucket_len = bucket[i] | ||||
| ds_per = batch_per_bucket(bucket_len, dataset_path) | ds_per = batch_per_bucket(bucket_len, dataset_path) | ||||
| if i == 0: | if i == 0: | ||||
| ds = ds_per | |||||
| data_set = ds_per | |||||
| else: | else: | ||||
| ds = ds + ds_per | |||||
| ds = ds.shuffle(ds.get_dataset_size()) | |||||
| ds.channel_name = 'fasttext' | |||||
| data_set = data_set + ds_per | |||||
| data_set = data_set.shuffle(data_set.get_dataset_size()) | |||||
| data_set.channel_name = 'fasttext' | |||||
| return ds | |||||
| return data_set | |||||
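The loop above builds one dataset per bucket, concatenates them with `+`, and shuffles the result. A sketch of that accumulation pattern with synthetic per-bucket datasets; bucket lengths and column names are illustrative, and NumpySlicesDataset replaces the per-bucket MindRecord files.

    import numpy as np
    import mindspore.dataset as ds

    def batch_per_bucket(bucket_length, batch_size=2):
        # synthetic rows padded to bucket_length stand in for one bucket's MindRecord file
        tokens = np.zeros((6, bucket_length), dtype=np.int32)
        labels = np.arange(6, dtype=np.int32)
        data_set = ds.NumpySlicesDataset({"src_tokens": tokens, "label_idx": labels}, shuffle=False)
        return data_set.batch(batch_size, drop_remainder=False)

    data_set = None
    for i, bucket_len in enumerate([16, 32, 64]):
        ds_per = batch_per_bucket(bucket_len)
        data_set = ds_per if i == 0 else data_set + ds_per  # '+' concatenates datasets

    data_set = data_set.shuffle(data_set.get_dataset_size())
    print("total batches:", data_set.get_dataset_size())  # 9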
| @@ -15,7 +15,7 @@ | |||||
| """Dataset loader to feed into model.""" | """Dataset loader to feed into model.""" | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as deC | import mindspore.dataset.transforms.c_transforms as deC | ||||
| @@ -55,7 +55,7 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||||
| print(f" | Loading {datafile}.") | print(f" | Loading {datafile}.") | ||||
| if not is_translate: | if not is_translate: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| input_files, columns_list=[ | input_files, columns_list=[ | ||||
| "src", "src_padding", | "src", "src_padding", | ||||
| "prev_opt", | "prev_opt", | ||||
| @@ -64,18 +64,18 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||||
| num_parallel_workers=8 | num_parallel_workers=8 | ||||
| ) | ) | ||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print(f" | Dataset size: {ori_dataset_size}.") | print(f" | Dataset size: {ori_dataset_size}.") | ||||
| if shuffle: | if shuffle: | ||||
| ds = ds.shuffle(buffer_size=ori_dataset_size // 20) | |||||
| data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20) | |||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8) | |||||
| ds = ds.rename( | |||||
| data_set = data_set.rename( | |||||
| input_columns=["src", | input_columns=["src", | ||||
| "src_padding", | "src_padding", | ||||
| "prev_opt", | "prev_opt", | ||||
| @@ -87,9 +87,9 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||||
| "target_eos_ids", | "target_eos_ids", | ||||
| "target_eos_mask"] | "target_eos_mask"] | ||||
| ) | ) | ||||
| ds = ds.batch(batch_size, drop_remainder=drop_remainder) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) | |||||
| else: | else: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| input_files, columns_list=[ | input_files, columns_list=[ | ||||
| "src", "src_padding" | "src", "src_padding" | ||||
| ], | ], | ||||
| @@ -97,23 +97,23 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||||
| num_parallel_workers=8 | num_parallel_workers=8 | ||||
| ) | ) | ||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print(f" | Dataset size: {ori_dataset_size}.") | print(f" | Dataset size: {ori_dataset_size}.") | ||||
| if shuffle: | if shuffle: | ||||
| ds = ds.shuffle(buffer_size=ori_dataset_size // 20) | |||||
| data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20) | |||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||||
| ds = ds.rename( | |||||
| data_set = data_set.rename( | |||||
| input_columns=["src", | input_columns=["src", | ||||
| "src_padding"], | "src_padding"], | ||||
| output_columns=["source_eos_ids", | output_columns=["source_eos_ids", | ||||
| "source_eos_mask"] | "source_eos_mask"] | ||||
| ) | ) | ||||
| ds = ds.batch(batch_size, drop_remainder=drop_remainder) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) | |||||
| return ds | |||||
| return data_set | |||||
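The hunks above all apply the same refactor: with the module now imported as `import mindspore.dataset as ds`, the local pipeline handle is renamed from `ds` to `data_set` so the first assignment no longer shadows the module alias (otherwise a later `ds.Schema()` or `ds.config.set_seed()` in the same scope would fail). Below is a minimal, self-contained sketch of that pattern, not the repository's code: it swaps the MindRecord source for an in-memory `NumpySlicesDataset` so it runs without data files, and the column names and sizes are invented for illustration.

import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC


def build_toy_pipeline(batch_size=4, shuffle=True):
    """Toy stand-in for _load_dataset: the module alias stays `ds`, the pipeline handle is `data_set`."""
    # invented data; NumpySlicesDataset replaces MindDataset so the sketch runs without .mindrecord files
    src = np.random.randint(0, 100, size=(32, 8)).astype(np.int64)
    target = np.random.randint(0, 100, size=(32, 8)).astype(np.int64)
    data_set = ds.NumpySlicesDataset({"src": src, "target": target}, shuffle=shuffle)
    type_cast_op = deC.TypeCast(mstype.int32)
    data_set = data_set.map(operations=type_cast_op, input_columns="src")
    data_set = data_set.map(operations=type_cast_op, input_columns="target")
    data_set = data_set.rename(input_columns=["src", "target"],
                               output_columns=["source_eos_ids", "target_eos_ids"])
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set


for batch in build_toy_pipeline().create_tuple_iterator():
    print([col.shape for col in batch])
    break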
| def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool, | def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool, | ||||
| @@ -14,7 +14,7 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """Dataset loader to feed into model.""" | """Dataset loader to feed into model.""" | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as deC | import mindspore.dataset.transforms.c_transforms as deC | ||||
| @@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||||
| for datafile in input_files: | for datafile in input_files: | ||||
| print(f" | Loading {datafile}.") | print(f" | Loading {datafile}.") | ||||
| ds = de.TFRecordDataset( | |||||
| data_set = ds.TFRecordDataset( | |||||
| input_files, | input_files, | ||||
| columns_list=[ | columns_list=[ | ||||
| "src", "src_padding", | "src", "src_padding", | ||||
| @@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||||
| shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, | shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, | ||||
| shard_equal_rows=True, num_parallel_workers=8) | shard_equal_rows=True, num_parallel_workers=8) | ||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print(f" | Dataset size: {ori_dataset_size}.") | print(f" | Dataset size: {ori_dataset_size}.") | ||||
| repeat_count = epoch_count | repeat_count = epoch_count | ||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="src") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="src_padding") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="prev_opt") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="prev_padding") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="tgt_padding") | |||||
| ds = ds.rename( | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="src") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_padding") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="prev_opt") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="prev_padding") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="tgt_padding") | |||||
| data_set = data_set.rename( | |||||
| input_columns=["src", | input_columns=["src", | ||||
| "src_padding", | "src_padding", | ||||
| "prev_opt", | "prev_opt", | ||||
| @@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||||
| "target_eos_mask"] | "target_eos_mask"] | ||||
| ) | ) | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| ds.channel_name = 'transformer' | |||||
| return ds | |||||
| data_set.channel_name = 'transformer' | |||||
| return data_set | |||||
| def load_dataset(data_files: list, batch_size: int, epoch_count: int, | def load_dataset(data_files: list, batch_size: int, epoch_count: int, | ||||
| @@ -14,7 +14,7 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """Dataset loader to feed into model.""" | """Dataset loader to feed into model.""" | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as deC | import mindspore.dataset.transforms.c_transforms as deC | ||||
| @@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||||
| for datafile in input_files: | for datafile in input_files: | ||||
| print(f" | Loading {datafile}.") | print(f" | Loading {datafile}.") | ||||
| ds = de.TFRecordDataset( | |||||
| data_set = ds.TFRecordDataset( | |||||
| input_files, | input_files, | ||||
| columns_list=[ | columns_list=[ | ||||
| "src", "src_padding", | "src", "src_padding", | ||||
| @@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||||
| shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, | shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, | ||||
| shard_equal_rows=True, num_parallel_workers=8) | shard_equal_rows=True, num_parallel_workers=8) | ||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print(f" | Dataset size: {ori_dataset_size}.") | print(f" | Dataset size: {ori_dataset_size}.") | ||||
| repeat_count = epoch_count | repeat_count = epoch_count | ||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="src", operations=type_cast_op) | |||||
| ds = ds.map(input_columns="src_padding", operations=type_cast_op) | |||||
| ds = ds.map(input_columns="prev_opt", operations=type_cast_op) | |||||
| ds = ds.map(input_columns="prev_padding", operations=type_cast_op) | |||||
| ds = ds.map(input_columns="target", operations=type_cast_op) | |||||
| ds = ds.map(input_columns="tgt_padding", operations=type_cast_op) | |||||
| ds = ds.rename( | |||||
| data_set = data_set.map(input_columns="src", operations=type_cast_op) | |||||
| data_set = data_set.map(input_columns="src_padding", operations=type_cast_op) | |||||
| data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op) | |||||
| data_set = data_set.map(input_columns="prev_padding", operations=type_cast_op) | |||||
| data_set = data_set.map(input_columns="target", operations=type_cast_op) | |||||
| data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op) | |||||
| data_set = data_set.rename( | |||||
| input_columns=["src", | input_columns=["src", | ||||
| "src_padding", | "src_padding", | ||||
| "prev_opt", | "prev_opt", | ||||
| @@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||||
| "target_eos_mask"] | "target_eos_mask"] | ||||
| ) | ) | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| ds = ds.repeat(repeat_count) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.repeat(repeat_count) | |||||
| ds.channel_name = 'transformer' | |||||
| return ds | |||||
| data_set.channel_name = 'transformer' | |||||
| return data_set | |||||
| def load_dataset(data_files: list, batch_size: int, epoch_count: int, | def load_dataset(data_files: list, batch_size: int, epoch_count: int, | ||||
| @@ -18,14 +18,16 @@ | |||||
| import os | import os | ||||
| from enum import Enum | from enum import Enum | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| class DataType(Enum): | class DataType(Enum): | ||||
| """Enumerate supported dataset format""" | """Enumerate supported dataset format""" | ||||
| TFRECORD = 1 | TFRECORD = 1 | ||||
| MINDRECORD = 2 | MINDRECORD = 2 | ||||
| def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, | def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, | ||||
| do_shuffle="true", data_dir=None, schema_dir=None, | do_shuffle="true", data_dir=None, schema_dir=None, | ||||
| data_type=DataType.TFRECORD): | data_type=DataType.TFRECORD): | ||||
| @@ -47,22 +49,22 @@ def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, | |||||
| shuffle = False | shuffle = False | ||||
| if data_type == DataType.MINDRECORD: | if data_type == DataType.MINDRECORD: | ||||
| ds = de.MindDataset(data_files, columns_list=columns_list, | |||||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank) | |||||
| data_set = ds.MindDataset(data_files, columns_list=columns_list, | |||||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank) | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list, | |||||
| shuffle=shuffle, num_shards=device_num, shard_id=rank, | |||||
| shard_equal_rows=shard_equal_rows) | |||||
| data_set = ds.TFRecordDataset(data_files, schema_dir, columns_list=columns_list, | |||||
| shuffle=shuffle, num_shards=device_num, shard_id=rank, | |||||
| shard_equal_rows=shard_equal_rows) | |||||
| if device_num == 1 and shuffle is True: | if device_num == 1 and shuffle is True: | ||||
| ds = ds.shuffle(10000) | |||||
| data_set = data_set.shuffle(10000) | |||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| if task == "td": | if task == "td": | ||||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| return data_set | |||||
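create_tinybert_dataset follows the same convention while also branching on the record format and sharding by `device_num`/`rank`. A hedged stand-in for that wiring, fed from in-memory arrays instead of TFRecord/MindRecord files (column names and sizes are placeholders):

import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C


def create_toy_bert_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true"):
    """Toy analogue of create_tinybert_dataset: same sharding and TypeCast steps, no record files."""
    num_rows, seq_len = 128, 16
    # placeholder columns mimicking the BERT-style inputs used above
    data = {
        "input_ids": np.random.randint(0, 1000, size=(num_rows, seq_len)).astype(np.int64),
        "input_mask": np.ones((num_rows, seq_len), dtype=np.int64),
        "segment_ids": np.zeros((num_rows, seq_len), dtype=np.int64),
    }
    data_set = ds.NumpySlicesDataset(data, shuffle=(do_shuffle == "true"),
                                     num_shards=device_num, shard_id=rank)
    type_cast_op = C.TypeCast(mstype.int32)
    for column in ("segment_ids", "input_mask", "input_ids"):
        data_set = data_set.map(operations=type_cast_op, input_columns=column)
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set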
| @@ -23,38 +23,41 @@ from mindspore.common.parameter import Parameter | |||||
| from mindspore.common.tensor import Tensor | from mindspore.common.tensor import Tensor | ||||
| from mindspore.train.model import Model | from mindspore.train.model import Model | ||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | from mindspore.train.serialization import load_checkpoint, load_param_into_net | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as deC | import mindspore.dataset.transforms.c_transforms as deC | ||||
| from mindspore import context | from mindspore import context | ||||
| from src.transformer_model import TransformerModel | from src.transformer_model import TransformerModel | ||||
| from src.eval_config import cfg, transformer_net_cfg | from src.eval_config import cfg, transformer_net_cfg | ||||
| def load_test_data(batch_size=1, data_file=None): | def load_test_data(batch_size=1, data_file=None): | ||||
| """ | """ | ||||
| Load test dataset | Load test dataset | ||||
| """ | """ | ||||
| ds = de.MindDataset(data_file, | |||||
| columns_list=["source_eos_ids", "source_eos_mask", | |||||
| "target_sos_ids", "target_sos_mask", | |||||
| "target_eos_ids", "target_eos_mask"], | |||||
| shuffle=False) | |||||
| data_set = ds.MindDataset(data_file, | |||||
| columns_list=["source_eos_ids", "source_eos_mask", | |||||
| "target_sos_ids", "target_sos_mask", | |||||
| "target_eos_ids", "target_eos_mask"], | |||||
| shuffle=False) | |||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| ds.channel_name = 'transformer' | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| data_set.channel_name = 'transformer' | |||||
| return data_set | |||||
| class TransformerInferCell(nn.Cell): | class TransformerInferCell(nn.Cell): | ||||
| """ | """ | ||||
| Encapsulation class of transformer network inference. | Encapsulation class of transformer network inference. | ||||

| """ | """ | ||||
| def __init__(self, network): | def __init__(self, network): | ||||
| super(TransformerInferCell, self).__init__(auto_prefix=False) | super(TransformerInferCell, self).__init__(auto_prefix=False) | ||||
| self.network = network | self.network = network | ||||
| @@ -65,6 +68,7 @@ class TransformerInferCell(nn.Cell): | |||||
| predicted_ids = self.network(source_ids, source_mask) | predicted_ids = self.network(source_ids, source_mask) | ||||
| return predicted_ids | return predicted_ids | ||||
| def load_weights(model_path): | def load_weights(model_path): | ||||
| """ | """ | ||||
| Load checkpoint as parameter dict, support both npz file and mindspore checkpoint file. | Load checkpoint as parameter dict, support both npz file and mindspore checkpoint file. | ||||
| @@ -93,6 +97,7 @@ def load_weights(model_path): | |||||
| parameter_dict[name] = Parameter(Tensor(weights[name]), name=name) | parameter_dict[name] = Parameter(Tensor(weights[name]), name=name) | ||||
| return parameter_dict | return parameter_dict | ||||
| def run_transformer_eval(): | def run_transformer_eval(): | ||||
| """ | """ | ||||
| Transformer evaluation. | Transformer evaluation. | ||||
| @@ -136,5 +141,6 @@ def run_transformer_eval(): | |||||
| f.write(" ".join(token_ids) + "\n") | f.write(" ".join(token_ids) + "\n") | ||||
| f.close() | f.close() | ||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| run_transformer_eval() | run_transformer_eval() | ||||
| @@ -21,7 +21,7 @@ from enum import Enum | |||||
| import numpy as np | import numpy as np | ||||
| import pandas as pd | import pandas as pd | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| from .config import DataConfig | from .config import DataConfig | ||||
| @@ -142,8 +142,8 @@ class H5Dataset(): | |||||
| X_id = X[:, 0:self.max_length] | X_id = X[:, 0:self.max_length] | ||||
| X_va = X[:, self.max_length:] | X_va = X[:, self.max_length:] | ||||
| yield np.array(X_id.astype(dtype=np.int32)), \ | yield np.array(X_id.astype(dtype=np.int32)), \ | ||||
| np.array(X_va.astype(dtype=np.float32)), \ | |||||
| np.array(y.astype(dtype=np.float32)) | |||||
| np.array(X_va.astype(dtype=np.float32)), \ | |||||
| np.array(y.astype(dtype=np.float32)) | |||||
| def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | ||||
| @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||||
| for _ in range(0, numbers_of_batch, 1): | for _ in range(0, numbers_of_batch, 1): | ||||
| yield train_eval_gen.__next__() | yield train_eval_gen.__next__() | ||||
| ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||||
| shuffle = train_mode | shuffle = train_mode | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| else: | else: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
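The loaders above store `line_per_sample` records per MindRecord row, so they batch `batch_size / line_per_sample` rows and then flatten and reshape each column back to per-sample shape inside `map`. A runnable sketch of that unpack step, driven by a toy generator rather than the real MindRecord files (row counts and shapes are illustrative):

import numpy as np
import mindspore.dataset as ds


def toy_packed_dataset(batch_size=1000, line_per_sample=10, field_size=39, epochs=1):
    """Each stored row packs `line_per_sample` samples, as in the MindRecord files above."""
    def packed_rows():
        # 200 synthetic packed rows; each holds line_per_sample * field_size ids and values
        for _ in range(200):
            yield (np.random.randint(1, 100, size=(line_per_sample * field_size,)).astype(np.int32),
                   np.random.rand(line_per_sample * field_size).astype(np.float32),
                   np.random.randint(0, 2, size=(line_per_sample,)).astype(np.float32))

    data_set = ds.GeneratorDataset(packed_rows, ["feat_ids", "feat_vals", "label"])
    # batch enough packed rows to hold `batch_size` real samples, then unpack by reshaping
    data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
    data_set = data_set.map(operations=(lambda x, y, z: (x.reshape(batch_size, field_size),
                                                         y.reshape(batch_size, field_size),
                                                         z.reshape(batch_size, 1))),
                            input_columns=['feat_ids', 'feat_vals', 'label'],
                            num_parallel_workers=8)
    data_set = data_set.repeat(epochs)
    return data_set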
| def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||||
| for filename in filenames: | for filename in filenames: | ||||
| if file_prefixt_name in filename and 'tfrecord' in filename: | if file_prefixt_name in filename and 'tfrecord' in filename: | ||||
| dataset_files.append(os.path.join(dir_path, filename)) | dataset_files.append(os.path.join(dir_path, filename)) | ||||
| schema = de.Schema() | |||||
| schema = ds.Schema() | |||||
| schema.add_column('feat_ids', de_type=mstype.int32) | schema.add_column('feat_ids', de_type=mstype.int32) | ||||
| schema.add_column('feat_vals', de_type=mstype.float32) | schema.add_column('feat_vals', de_type=mstype.float32) | ||||
| schema.add_column('label', de_type=mstype.float32) | schema.add_column('label', de_type=mstype.float32) | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, | |||||
| shard_equal_rows=True) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, | |||||
| shard_equal_rows=True) | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: ( | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||||
| np.array(x).flatten().reshape(batch_size, 39), | np.array(x).flatten().reshape(batch_size, 39), | ||||
| np.array(y).flatten().reshape(batch_size, 39), | np.array(y).flatten().reshape(batch_size, 39), | ||||
| np.array(z).flatten().reshape(batch_size, 1))), | np.array(z).flatten().reshape(batch_size, 1))), | ||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -14,13 +14,12 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """train_dataset.""" | """train_dataset.""" | ||||
| import os | import os | ||||
| import math | import math | ||||
| from enum import Enum | from enum import Enum | ||||
| import numpy as np | import numpy as np | ||||
| import pandas as pd | import pandas as pd | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| @@ -84,9 +83,9 @@ class H5Dataset(): | |||||
| yield os.path.join(self._hdf_data_dir, | yield os.path.join(self._hdf_data_dir, | ||||
| self._file_prefix + '_input_part_' + str( | self._file_prefix + '_input_part_' + str( | ||||
| p) + '.h5'), \ | p) + '.h5'), \ | ||||
| os.path.join(self._hdf_data_dir, | |||||
| self._file_prefix + '_output_part_' + str( | |||||
| p) + '.h5'), i + 1 == len(parts) | |||||
| os.path.join(self._hdf_data_dir, | |||||
| self._file_prefix + '_output_part_' + str( | |||||
| p) + '.h5'), i + 1 == len(parts) | |||||
| def _generator(self, X, y, batch_size, shuffle=True): | def _generator(self, X, y, batch_size, shuffle=True): | ||||
| """ | """ | ||||
| @@ -106,8 +105,7 @@ class H5Dataset(): | |||||
| np.random.shuffle(sample_index) | np.random.shuffle(sample_index) | ||||
| assert X.shape[0] > 0 | assert X.shape[0] > 0 | ||||
| while True: | while True: | ||||
| batch_index = sample_index[ | |||||
| batch_size * counter: batch_size * (counter + 1)] | |||||
| batch_index = sample_index[batch_size * counter: batch_size * (counter + 1)] | |||||
| X_batch = X[batch_index] | X_batch = X[batch_index] | ||||
| y_batch = y[batch_index] | y_batch = y[batch_index] | ||||
| counter += 1 | counter += 1 | ||||
| @@ -140,9 +138,8 @@ class H5Dataset(): | |||||
| X, y, finished = data_gen.__next__() | X, y, finished = data_gen.__next__() | ||||
| X_id = X[:, 0:self.input_length] | X_id = X[:, 0:self.input_length] | ||||
| X_va = X[:, self.input_length:] | X_va = X[:, self.input_length:] | ||||
| yield np.array(X_id.astype(dtype=np.int32)), np.array( | |||||
| X_va.astype(dtype=np.float32)), np.array( | |||||
| y.astype(dtype=np.float32)) | |||||
| yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array( | |||||
| y.astype(dtype=np.float32)) | |||||
| def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | ||||
| @@ -164,9 +161,9 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | |||||
| for _ in range(0, numbers_of_batch, 1): | for _ in range(0, numbers_of_batch, 1): | ||||
| yield train_eval_gen.__next__() | yield train_eval_gen.__next__() | ||||
| ds = de.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"]) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"]) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _padding_func(batch_size, manual_shape, target_column, field_size=39): | def _padding_func(batch_size, manual_shape, target_column, field_size=39): | ||||
| @@ -174,11 +171,11 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39): | |||||
| get padding_func | get padding_func | ||||
| """ | """ | ||||
| if manual_shape: | if manual_shape: | ||||
| generate_concat_offset = [item[0]+item[1] for item in manual_shape] | |||||
| generate_concat_offset = [item[0] + item[1] for item in manual_shape] | |||||
| part_size = int(target_column / len(generate_concat_offset)) | part_size = int(target_column / len(generate_concat_offset)) | ||||
| filled_value = [] | filled_value = [] | ||||
| for i in range(field_size, target_column): | for i in range(field_size, target_column): | ||||
| filled_value.append(generate_concat_offset[i//part_size]-1) | |||||
| filled_value.append(generate_concat_offset[i // part_size] - 1) | |||||
| print("Filed Value:", filled_value) | print("Filed Value:", filled_value) | ||||
| def padding_func(x, y, z): | def padding_func(x, y, z): | ||||
| @@ -190,7 +187,7 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39): | |||||
| dtype=np.int32) * filled_value | dtype=np.int32) * filled_value | ||||
| x_id = np.concatenate([x, x_id.astype(dtype=np.int32)], axis=1) | x_id = np.concatenate([x, x_id.astype(dtype=np.int32)], axis=1) | ||||
| mask = np.concatenate( | mask = np.concatenate( | ||||
| [y, np.zeros((batch_size, target_column-39), dtype=np.float32)], axis=1) | |||||
| [y, np.zeros((batch_size, target_column - 39), dtype=np.float32)], axis=1) | |||||
| return (x_id, mask, z) | return (x_id, mask, z) | ||||
| else: | else: | ||||
| def padding_func(x, y, z): | def padding_func(x, y, z): | ||||
| @@ -214,24 +211,25 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | |||||
| for filename in filenames: | for filename in filenames: | ||||
| if file_prefix_name in filename and "tfrecord" in filename: | if file_prefix_name in filename and "tfrecord" in filename: | ||||
| dataset_files.append(os.path.join(dirpath, filename)) | dataset_files.append(os.path.join(dirpath, filename)) | ||||
| schema = de.Schema() | |||||
| schema = ds.Schema() | |||||
| schema.add_column('feat_ids', de_type=mstype.int32) | schema.add_column('feat_ids', de_type=mstype.int32) | ||||
| schema.add_column('feat_vals', de_type=mstype.float32) | schema.add_column('feat_vals', de_type=mstype.float32) | ||||
| schema.add_column('label', de_type=mstype.float32) | schema.add_column('label', de_type=mstype.float32) | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, | |||||
| num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, | |||||
| shuffle=shuffle, schema=schema, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), | |||||
| drop_remainder=True) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, | |||||
| shuffle=shuffle, schema=schema, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), | |||||
| drop_remainder=True) | |||||
| ds = ds.map(operations=_padding_func(batch_size, manual_shape, target_column), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = data_set.map(operations=_padding_func(batch_size, manual_shape, target_column), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -257,21 +255,21 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||||
| shuffle = train_mode | shuffle = train_mode | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| else: | else: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(_padding_func(batch_size, manual_shape, target_column), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(_padding_func(batch_size, manual_shape, target_column), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multiply=False, per_vocab_size=None): | def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multiply=False, per_vocab_size=None): | ||||
| @@ -284,7 +282,7 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl | |||||
| 5, 21762, 14, 15, 15030, 61, 12220] | 5, 21762, 14, 15, 15030, 61, 12220] | ||||
| new_vocabs = inidival_vocabs + [1] * \ | new_vocabs = inidival_vocabs + [1] * \ | ||||
| (target_column_number - len(inidival_vocabs)) | |||||
| (target_column_number - len(inidival_vocabs)) | |||||
| part_size = int(target_column_number / worker_size) | part_size = int(target_column_number / worker_size) | ||||
| # According to the workers, we merge some fields into the same part | # According to the workers, we merge some fields into the same part | ||||
| @@ -304,21 +302,21 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl | |||||
| # Expands the vocabulary of each field by the multiplier | # Expands the vocabulary of each field by the multiplier | ||||
| if multiply is True: | if multiply is True: | ||||
| cur_sum = sum(new_vocab_size) | cur_sum = sum(new_vocab_size) | ||||
| k = total_vocab_size/cur_sum | |||||
| k = total_vocab_size / cur_sum | |||||
| new_vocab_size = [ | new_vocab_size = [ | ||||
| math.ceil(int(item*k)/worker_size)*worker_size for item in new_vocab_size] | |||||
| new_vocab_size = [(item // 8 + 1)*8 for item in new_vocab_size] | |||||
| math.ceil(int(item * k) / worker_size) * worker_size for item in new_vocab_size] | |||||
| new_vocab_size = [(item // 8 + 1) * 8 for item in new_vocab_size] | |||||
| else: | else: | ||||
| if total_vocab_size > sum(new_vocab_size): | if total_vocab_size > sum(new_vocab_size): | ||||
| new_vocab_size[-1] = total_vocab_size - \ | new_vocab_size[-1] = total_vocab_size - \ | ||||
| sum(new_vocab_size[:-1]) | |||||
| sum(new_vocab_size[:-1]) | |||||
| new_vocab_size = [item for item in new_vocab_size] | new_vocab_size = [item for item in new_vocab_size] | ||||
| else: | else: | ||||
| raise ValueError( | raise ValueError( | ||||
| "Please providede the correct vocab size, now is {}".format(total_vocab_size)) | "Please providede the correct vocab size, now is {}".format(total_vocab_size)) | ||||
| for i in range(worker_size-1): | |||||
| for i in range(worker_size - 1): | |||||
| off = index_offsets[i] + features[i] | off = index_offsets[i] + features[i] | ||||
| index_offsets.append(off) | index_offsets.append(off) | ||||
| @@ -17,7 +17,7 @@ | |||||
| import os | import os | ||||
| import sys | import sys | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| from mindspore import Model, context | from mindspore import Model, context | ||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor | from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor | ||||
| from mindspore.context import ParallelMode | from mindspore.context import ParallelMode | ||||
| @@ -88,7 +88,7 @@ def train_and_eval(config): | |||||
| print("epochs is {}".format(epochs)) | print("epochs is {}".format(epochs)) | ||||
| if config.full_batch: | if config.full_batch: | ||||
| context.set_auto_parallel_context(full_batch=True) | context.set_auto_parallel_context(full_batch=True) | ||||
| de.config.set_seed(1) | |||||
| ds.config.set_seed(1) | |||||
| if config.field_slice: | if config.field_slice: | ||||
| compute_manual_shape(config, get_group_size()) | compute_manual_shape(config, get_group_size()) | ||||
| ds_train = create_dataset(data_path, train_mode=True, epochs=1, | ds_train = create_dataset(data_path, train_mode=True, epochs=1, | ||||
| @@ -17,7 +17,7 @@ | |||||
| import os | import os | ||||
| import sys | import sys | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| from mindspore import Model, context | from mindspore import Model, context | ||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor | from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor | ||||
| from mindspore.context import ParallelMode | from mindspore.context import ParallelMode | ||||
| @@ -92,7 +92,7 @@ def train_and_eval(config): | |||||
| print("epochs is {}".format(epochs)) | print("epochs is {}".format(epochs)) | ||||
| if config.full_batch: | if config.full_batch: | ||||
| context.set_auto_parallel_context(full_batch=True) | context.set_auto_parallel_context(full_batch=True) | ||||
| de.config.set_seed(1) | |||||
| ds.config.set_seed(1) | |||||
| ds_train = create_dataset(data_path, train_mode=True, epochs=1, | ds_train = create_dataset(data_path, train_mode=True, epochs=1, | ||||
| batch_size=batch_size*get_group_size(), data_type=dataset_type) | batch_size=batch_size*get_group_size(), data_type=dataset_type) | ||||
| ds_eval = create_dataset(data_path, train_mode=False, epochs=1, | ds_eval = create_dataset(data_path, train_mode=False, epochs=1, | ||||
| @@ -18,7 +18,7 @@ import math | |||||
| import pickle | import pickle | ||||
| import numpy as np | import numpy as np | ||||
| import pandas as pd | import pandas as pd | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| @@ -97,8 +97,7 @@ class H5Dataset(): | |||||
| np.random.shuffle(sample_index) | np.random.shuffle(sample_index) | ||||
| assert X.shape[0] > 0 | assert X.shape[0] > 0 | ||||
| while True: | while True: | ||||
| batch_index = sample_index[batch_size * counter:batch_size * | |||||
| (counter + 1)] | |||||
| batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] | |||||
| X_batch = X[batch_index] | X_batch = X[batch_index] | ||||
| y_batch = y[batch_index] | y_batch = y[batch_index] | ||||
| counter += 1 | counter += 1 | ||||
| @@ -135,9 +134,8 @@ class H5Dataset(): | |||||
| X, y, finished = data_gen.__next__() | X, y, finished = data_gen.__next__() | ||||
| X_id = X[:, 0:self.input_length] | X_id = X[:, 0:self.input_length] | ||||
| X_va = X[:, self.input_length:] | X_va = X[:, self.input_length:] | ||||
| yield np.array(X_id.astype(dtype=np.int32)), np.array( | |||||
| X_va.astype(dtype=np.float32)), np.array( | |||||
| y.astype(dtype=np.float32)) | |||||
| yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array( | |||||
| y.astype(dtype=np.float32)) | |||||
| def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | ||||
| @@ -159,10 +157,10 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | |||||
| for _ in range(0, numbers_of_batch, 1): | for _ in range(0, numbers_of_batch, 1): | ||||
| yield train_eval_gen.__next__() | yield train_eval_gen.__next__() | ||||
| ds = de.GeneratorDataset(_iter_h5_data(), | |||||
| ["ids", "weights", "labels"]) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.GeneratorDataset(_iter_h5_data(), | |||||
| ["ids", "weights", "labels"]) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_tf_dataset(data_dir, | def _get_tf_dataset(data_dir, | ||||
| @@ -184,7 +182,7 @@ def _get_tf_dataset(data_dir, | |||||
| for filename in filenames: | for filename in filenames: | ||||
| if file_prefix_name in filename and "tfrecord" in filename: | if file_prefix_name in filename and "tfrecord" in filename: | ||||
| dataset_files.append(os.path.join(dirpath, filename)) | dataset_files.append(os.path.join(dirpath, filename)) | ||||
| schema = de.Schema() | |||||
| schema = ds.Schema() | |||||
| float_key_list = ["label", "continue_val"] | float_key_list = ["label", "continue_val"] | ||||
| @@ -199,19 +197,19 @@ def _get_tf_dataset(data_dir, | |||||
| schema.add_column(key, de_type=ms_dtype) | schema.add_column(key, de_type=ms_dtype) | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, | |||||
| shuffle=shuffle, | |||||
| schema=schema, | |||||
| num_parallel_workers=8, | |||||
| num_shards=rank_size, | |||||
| shard_id=rank_id, | |||||
| shard_equal_rows=True) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, | |||||
| shuffle=shuffle, | |||||
| schema=schema, | |||||
| num_parallel_workers=8, | |||||
| num_shards=rank_size, | |||||
| shard_id=rank_id, | |||||
| shard_equal_rows=True) | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, | |||||
| shuffle=shuffle, | |||||
| schema=schema, | |||||
| num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, | |||||
| shuffle=shuffle, | |||||
| schema=schema, | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| operations_list = [] | operations_list = [] | ||||
| for key in columns_list: | for key in columns_list: | ||||
| @@ -249,7 +247,7 @@ def _get_tf_dataset(data_dir, | |||||
| u = np.array(u).flatten().reshape(batch_size, -1) | u = np.array(u).flatten().reshape(batch_size, -1) | ||||
| return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u | return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u | ||||
| ds = ds.map( | |||||
| data_set = data_set.map( | |||||
| operations=mixup, | operations=mixup, | ||||
| input_columns=[ | input_columns=[ | ||||
| 'label', 'continue_val', 'indicator_id', 'emb_128_id', | 'label', 'continue_val', 'indicator_id', 'emb_128_id', | ||||
| @@ -275,8 +273,8 @@ def _get_tf_dataset(data_dir, | |||||
| ], | ], | ||||
| num_parallel_workers=8) | num_parallel_workers=8) | ||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def compute_emb_dim(config): | def compute_emb_dim(config): | ||||
| @@ -24,16 +24,17 @@ import cv2 | |||||
| import numpy as np | import numpy as np | ||||
| import pycocotools.coco as coco | import pycocotools.coco as coco | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset as ds | |||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| from mindspore.mindrecord import FileWriter | from mindspore.mindrecord import FileWriter | ||||
| from src.image import color_aug, get_affine_transform, affine_transform | from src.image import color_aug, get_affine_transform, affine_transform | ||||
| from src.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian, draw_dense_reg | from src.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian, draw_dense_reg | ||||
| from src.visual import visual_image | from src.visual import visual_image | ||||
| _current_dir = os.path.dirname(os.path.realpath(__file__)) | _current_dir = os.path.dirname(os.path.realpath(__file__)) | ||||
| class COCOHP(de.Dataset): | |||||
| class COCOHP(ds.Dataset): | |||||
| """ | """ | ||||
| Encapsulation class of the COCO person keypoints dataset. | Encapsulation class of the COCO person keypoints dataset. | ||||
| Initialize and preprocess images for training and testing. | Initialize and preprocess images for training and testing. | ||||
| @@ -47,6 +48,7 @@ class COCOHP(de.Dataset): | |||||
| Returns: | Returns: | ||||
| Preprocessed training or testing dataset for the CenterNet network. | Preprocessed training or testing dataset for the CenterNet network. | ||||
| """ | """ | ||||
| def __init__(self, data_opt, run_mode="train", net_opt=None, enable_visual_image=False, save_path=None): | def __init__(self, data_opt, run_mode="train", net_opt=None, enable_visual_image=False, save_path=None): | ||||
| super(COCOHP, self).__init__() | super(COCOHP, self).__init__() | ||||
| self._data_rng = np.random.RandomState(123) | self._data_rng = np.random.RandomState(123) | ||||
| @@ -64,7 +66,6 @@ class COCOHP(de.Dataset): | |||||
| if not os.path.exists(self.save_path): | if not os.path.exists(self.save_path): | ||||
| os.makedirs(self.save_path) | os.makedirs(self.save_path) | ||||
| def init(self, data_dir, keep_res=False, flip_test=False): | def init(self, data_dir, keep_res=False, flip_test=False): | ||||
| """initailize additional info""" | """initailize additional info""" | ||||
| logger.info('Initializing coco 2017 {} data.'.format(self.run_mode)) | logger.info('Initializing coco 2017 {} data.'.format(self.run_mode)) | ||||
| @@ -124,7 +125,7 @@ class COCOHP(de.Dataset): | |||||
| for img_id in self.images: | for img_id in self.images: | ||||
| image_info = self.coco.loadImgs([img_id]) | image_info = self.coco.loadImgs([img_id]) | ||||
| annos = self.coco.loadAnns(self.anns[img_id]) | annos = self.coco.loadAnns(self.anns[img_id]) | ||||
| #get image | |||||
| # get image | |||||
| img_name = image_info[0]['file_name'] | img_name = image_info[0]['file_name'] | ||||
| img_name = os.path.join(self.image_path, img_name) | img_name = os.path.join(self.image_path, img_name) | ||||
| with open(img_name, 'rb') as f: | with open(img_name, 'rb') as f: | ||||
| @@ -147,19 +148,16 @@ class COCOHP(de.Dataset): | |||||
| writer.commit() | writer.commit() | ||||
| logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir)) | logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir)) | ||||
| def _coco_box_to_bbox(self, box): | def _coco_box_to_bbox(self, box): | ||||
| bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32) | bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32) | ||||
| return bbox | return bbox | ||||
| def _get_border(self, border, size): | def _get_border(self, border, size): | ||||
| i = 1 | i = 1 | ||||
| while size - border // i <= border // i: | while size - border // i <= border // i: | ||||
| i *= 2 | i *= 2 | ||||
| return border // i | return border // i | ||||
| def __getitem__(self, index): | def __getitem__(self, index): | ||||
| img_id = self.images[index] | img_id = self.images[index] | ||||
| file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name'] | file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name'] | ||||
| @@ -169,7 +167,6 @@ class COCOHP(de.Dataset): | |||||
| ret = (img, image_id) | ret = (img, image_id) | ||||
| return ret | return ret | ||||
| def pre_process_for_test(self, image, img_id, scale, meta=None): | def pre_process_for_test(self, image, img_id, scale, meta=None): | ||||
| """image pre-process for evaluation""" | """image pre-process for evaluation""" | ||||
| b, h, w, ch = image.shape | b, h, w, ch = image.shape | ||||
| @@ -249,7 +246,6 @@ class COCOHP(de.Dataset): | |||||
| return images, meta | return images, meta | ||||
| def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id): | def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id): | ||||
| """image pre-process and augmentation""" | """image pre-process and augmentation""" | ||||
| num_objs = min(num_objects, self.data_opt.max_objs) | num_objs = min(num_objects, self.data_opt.max_objs) | ||||
| @@ -269,12 +265,12 @@ class COCOHP(de.Dataset): | |||||
| else: | else: | ||||
| sf = self.data_opt.scale | sf = self.data_opt.scale | ||||
| cf = self.data_opt.shift | cf = self.data_opt.shift | ||||
| c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf) | |||||
| c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf) | |||||
| s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf) | |||||
| c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) | |||||
| c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) | |||||
| s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) | |||||
| if np.random.random() < self.data_opt.aug_rot: | if np.random.random() < self.data_opt.aug_rot: | ||||
| rf = self.data_opt.rotate | rf = self.data_opt.rotate | ||||
| rot = np.clip(np.random.randn()*rf, -rf*2, rf*2) | |||||
| rot = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) | |||||
| if np.random.random() < self.data_opt.flip_prop: | if np.random.random() < self.data_opt.flip_prop: | ||||
| flipped = True | flipped = True | ||||
| @@ -323,7 +319,7 @@ class COCOHP(de.Dataset): | |||||
| cls_id = int(category_id[k]) - 1 | cls_id = int(category_id[k]) - 1 | ||||
| pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3) | pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3) | ||||
| if flipped: | if flipped: | ||||
| bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero | |||||
| bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero | |||||
| pts[:, 0] = width - pts[:, 0] - 1 | pts[:, 0] = width - pts[:, 0] - 1 | ||||
| for e in self.data_opt.flip_idx: | for e in self.data_opt.flip_idx: | ||||
| pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy() | pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy() | ||||
| @@ -360,7 +356,7 @@ class COCOHP(de.Dataset): | |||||
| if pts[j, 2] > 0: | if pts[j, 2] > 0: | ||||
| pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot) | pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot) | ||||
| if pts[j, 0] >= 0 and pts[j, 0] < output_res and \ | if pts[j, 0] >= 0 and pts[j, 0] < output_res and \ | ||||
| pts[j, 1] >= 0 and pts[j, 1] < output_res: | |||||
| pts[j, 1] >= 0 and pts[j, 1] < output_res: | |||||
| kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int | kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int | ||||
| kps_mask[k, j * 2: j * 2 + 2] = 1 | kps_mask[k, j * 2: j * 2 + 2] = 1 | ||||
| pt_int = pts[j, :2].astype(np.int32) | pt_int = pts[j, :2].astype(np.int32) | ||||
| @@ -399,7 +395,6 @@ class COCOHP(de.Dataset): | |||||
| visual_image(out_img, ground_truth, self.save_path, ratio=self.data_opt.input_res[0] // output_res) | visual_image(out_img, ground_truth, self.save_path, ratio=self.data_opt.input_res[0] // output_res) | ||||
| return ret | return ret | ||||
| def create_train_dataset(self, mindrecord_dir, prefix="coco_hp.train.mind", batch_size=1, | def create_train_dataset(self, mindrecord_dir, prefix="coco_hp.train.mind", batch_size=1, | ||||
| device_num=1, rank=0, num_parallel_workers=1, do_shuffle=True): | device_num=1, rank=0, num_parallel_workers=1, do_shuffle=True): | ||||
| """create train dataset based on mindrecord file""" | """create train dataset based on mindrecord file""" | ||||
| @@ -415,41 +410,43 @@ class COCOHP(de.Dataset): | |||||
| raise ValueError('data_dir {} have no data files'.format(mindrecord_dir)) | raise ValueError('data_dir {} have no data files'.format(mindrecord_dir)) | ||||
| columns = ["image", "num_objects", "keypoints", "bbox", "category_id"] | columns = ["image", "num_objects", "keypoints", "bbox", "category_id"] | ||||
| ds = de.MindDataset(data_files, | |||||
| columns_list=columns, | |||||
| num_parallel_workers=num_parallel_workers, shuffle=do_shuffle, | |||||
| num_shards=device_num, shard_id=rank) | |||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| data_set = ds.MindDataset(data_files, | |||||
| columns_list=columns, | |||||
| num_parallel_workers=num_parallel_workers, shuffle=do_shuffle, | |||||
| num_shards=device_num, shard_id=rank) | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| logger.info('origin dataset size: {}'.format(ori_dataset_size)) | logger.info('origin dataset size: {}'.format(ori_dataset_size)) | ||||
| ds = ds.map(operations=self.preprocess_fn, | |||||
| input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"], | |||||
| output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||||
| column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||||
| num_parallel_workers=num_parallel_workers, | |||||
| python_multiprocessing=True) | |||||
| ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||||
| return ds | |||||
| data_set = data_set.map(operations=self.preprocess_fn, | |||||
| input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"], | |||||
| output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||||
| column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||||
| num_parallel_workers=num_parallel_workers, | |||||
| python_multiprocessing=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||||
| return data_set | |||||
| def create_eval_dataset(self, batch_size=1, num_parallel_workers=1): | def create_eval_dataset(self, batch_size=1, num_parallel_workers=1): | ||||
| """create testing dataset based on coco format""" | """create testing dataset based on coco format""" | ||||
| def generator(): | def generator(): | ||||
| for i in range(self.num_samples): | for i in range(self.num_samples): | ||||
| yield self.__getitem__(i) | yield self.__getitem__(i) | ||||
| column = ["image", "image_id"] | column = ["image", "image_id"] | ||||
| ds = de.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers) | |||||
| ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||||
| return ds | |||||
| data_set = ds.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||||
| return data_set | |||||
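create_eval_dataset feeds `ds.GeneratorDataset` from a closure over the class's own `__getitem__`. The stand-in below reproduces only that wiring, with random arrays in place of decoded COCO images; the class and column names are illustrative, not the repository's API.

import numpy as np
import mindspore.dataset as ds


class ToyEvalSource:
    """Minimal stand-in for the eval path: __getitem__ rows wrapped by a Python generator."""

    def __init__(self, num_samples=8):
        self.num_samples = num_samples

    def __getitem__(self, index):
        # random HWC image and a scalar id, replacing the preprocessed COCO sample
        image = np.random.randint(0, 255, size=(512, 512, 3)).astype(np.uint8)
        image_id = np.array(index, dtype=np.int32)
        return image, image_id

    def create_eval_dataset(self, batch_size=1, num_parallel_workers=1):
        def generator():
            for i in range(self.num_samples):
                yield self.__getitem__(i)

        column = ["image", "image_id"]
        data_set = ds.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers)
        data_set = data_set.batch(batch_size, drop_remainder=True)
        return data_set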
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| # Convert coco2017 dataset to mindrecord to improve performance on host | # Convert coco2017 dataset to mindrecord to improve performance on host | ||||
| from src.config import dataset_config | from src.config import dataset_config | ||||
| parser = argparse.ArgumentParser(description='CenterNet MindRecord dataset') | parser = argparse.ArgumentParser(description='CenterNet MindRecord dataset') | ||||
| parser.add_argument("--coco_data_dir", type=str, default="", help="Coco dataset directory.") | parser.add_argument("--coco_data_dir", type=str, default="", help="Coco dataset directory.") | ||||
| parser.add_argument("--mindrecord_dir", type=str, default="", help="MindRecord dataset dir.") | parser.add_argument("--mindrecord_dir", type=str, default="", help="MindRecord dataset dir.") | ||||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.vision.c_transforms as C | import mindspore.dataset.transforms.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.vision.py_transforms as P | import mindspore.dataset.transforms.vision.py_transforms as P | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| @@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| rank_size = int(os.getenv("RANK_SIZE")) | rank_size = int(os.getenv("RANK_SIZE")) | ||||
| rank_id = int(os.getenv("RANK_ID")) | rank_id = int(os.getenv("RANK_ID")) | ||||
| if rank_size == 1: | if rank_size == 1: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| dataset_path, num_parallel_workers=8, shuffle=True) | dataset_path, num_parallel_workers=8, shuffle=True) | ||||
| else: | else: | ||||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| elif platform == "GPU": | elif platform == "GPU": | ||||
| if do_train: | if do_train: | ||||
| from mindspore.communication.management import get_rank, get_group_size | from mindspore.communication.management import get_rank, get_group_size | ||||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=get_group_size(), shard_id=get_rank()) | |||||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=get_group_size(), shard_id=get_rank()) | |||||
| else: | else: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| dataset_path, num_parallel_workers=8, shuffle=True) | dataset_path, num_parallel_workers=8, shuffle=True) | ||||
| else: | else: | ||||
| raise ValueError("Unsupport platform.") | raise ValueError("Unsupport platform.") | ||||
| @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| color_op = C.RandomColorAdjust( | color_op = C.RandomColorAdjust( | ||||
| brightness=0.4, contrast=0.4, saturation=0.4) | brightness=0.4, contrast=0.4, saturation=0.4) | ||||
| rescale_op = C.Rescale(1/255.0, 0) | |||||
| rescale_op = C.Rescale(1 / 255.0, 0) | |||||
| normalize_op = C.Normalize( | normalize_op = C.Normalize( | ||||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | ||||
| change_swap_op = C.HWC2CHW() | change_swap_op = C.HWC2CHW() | ||||
| @@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| trans = composeop() | trans = composeop() | ||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="image", operations=trans, | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="label_list", | |||||
| operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="image", operations=trans, | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="label_list", | |||||
| operations=type_cast_op, num_parallel_workers=8) | |||||
| # apply shuffle operations | # apply shuffle operations | ||||
| ds = ds.shuffle(buffer_size=buffer_size) | |||||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
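
The Ascend branch above derives its shard layout from the RANK_SIZE / RANK_ID environment variables, while the GPU training branch asks the communication layer. A condensed sketch of that switch, assuming MindSpore is installed, `dataset_path` points at an existing MindRecord file, and distributed communication has already been initialized for the GPU case; the helper name is hypothetical.

```python
# Hypothetical helper condensing the sharding switch shown above.
import os
import mindspore.dataset as ds


def make_sharded_minddataset(dataset_path, platform="Ascend", do_train=True):
    if platform == "Ascend":
        rank_size = int(os.getenv("RANK_SIZE", "1"))
        rank_id = int(os.getenv("RANK_ID", "0"))
        if rank_size == 1:
            return ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True)
        return ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
                              num_shards=rank_size, shard_id=rank_id)
    if platform == "GPU" and do_train:
        from mindspore.communication.management import get_rank, get_group_size
        return ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
                              num_shards=get_group_size(), shard_id=get_rank())
    return ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True)
```

Either way, `num_shards` / `shard_id` give each rank a disjoint 1/N slice of the records, which is why the single-device case simply omits them.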
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.vision.c_transforms as C | import mindspore.dataset.transforms.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.vision.py_transforms as P | import mindspore.dataset.transforms.vision.py_transforms as P | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| @@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| rank_size = int(os.getenv("RANK_SIZE")) | rank_size = int(os.getenv("RANK_SIZE")) | ||||
| rank_id = int(os.getenv("RANK_ID")) | rank_id = int(os.getenv("RANK_ID")) | ||||
| if rank_size == 1: | if rank_size == 1: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| dataset_path, num_parallel_workers=8, shuffle=True) | dataset_path, num_parallel_workers=8, shuffle=True) | ||||
| else: | else: | ||||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| elif platform == "GPU": | elif platform == "GPU": | ||||
| if do_train: | if do_train: | ||||
| from mindspore.communication.management import get_rank, get_group_size | from mindspore.communication.management import get_rank, get_group_size | ||||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=get_group_size(), shard_id=get_rank()) | |||||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=get_group_size(), shard_id=get_rank()) | |||||
| else: | else: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| dataset_path, num_parallel_workers=8, shuffle=True) | dataset_path, num_parallel_workers=8, shuffle=True) | ||||
| else: | else: | ||||
| raise ValueError("Unsupport platform.") | raise ValueError("Unsupport platform.") | ||||
| @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| color_op = C.RandomColorAdjust( | color_op = C.RandomColorAdjust( | ||||
| brightness=0.4, contrast=0.4, saturation=0.4) | brightness=0.4, contrast=0.4, saturation=0.4) | ||||
| rescale_op = C.Rescale(1/255.0, 0) | |||||
| rescale_op = C.Rescale(1 / 255.0, 0) | |||||
| normalize_op = C.Normalize( | normalize_op = C.Normalize( | ||||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | ||||
| change_swap_op = C.HWC2CHW() | change_swap_op = C.HWC2CHW() | ||||
| @@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| trans = composeop() | trans = composeop() | ||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="image", operations=trans, | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="label_list", | |||||
| operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="image", operations=trans, | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="label_list", | |||||
| operations=type_cast_op, num_parallel_workers=8) | |||||
| # apply shuffle operations | # apply shuffle operations | ||||
| ds = ds.shuffle(buffer_size=buffer_size) | |||||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
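
Both copies of this pipeline rescale before normalizing: `C.Rescale(1/255.0, 0)` brings decoded uint8 pixels into [0, 1], which is the scale the ImageNet mean/std constants used here assume, and `C.HWC2CHW()` only reorders axes at the end. A NumPy-only check of that arithmetic; the random image is a stand-in for a decoded JPEG.

```python
# NumPy-only check of the Rescale -> Normalize -> HWC2CHW chain above.
import numpy as np

mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

img_hwc = np.random.randint(0, 256, size=(224, 224, 3)).astype(np.float32)  # fake decoded image
rescaled = img_hwc * (1.0 / 255.0) + 0.0        # C.Rescale(1/255.0, 0)
normalized = (rescaled - mean) / std            # C.Normalize(mean, std) on [0, 1] input
chw = normalized.transpose(2, 0, 1)             # C.HWC2CHW()
print(chw.shape)                                # (3, 224, 224)
```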
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.vision.py_transforms as P | import mindspore.dataset.vision.py_transforms as P | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| @@ -42,18 +42,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| rank_size = int(os.getenv("RANK_SIZE")) | rank_size = int(os.getenv("RANK_SIZE")) | ||||
| rank_id = int(os.getenv("RANK_ID")) | rank_id = int(os.getenv("RANK_ID")) | ||||
| if rank_size == 1: | if rank_size == 1: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| dataset_path, num_parallel_workers=8, shuffle=True) | dataset_path, num_parallel_workers=8, shuffle=True) | ||||
| else: | else: | ||||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| elif platform == "GPU": | elif platform == "GPU": | ||||
| if do_train: | if do_train: | ||||
| from mindspore.communication.management import get_rank, get_group_size | from mindspore.communication.management import get_rank, get_group_size | ||||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=get_group_size(), shard_id=get_rank()) | |||||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=get_group_size(), shard_id=get_rank()) | |||||
| else: | else: | ||||
| ds = de.MindDataset( | |||||
| data_set = ds.MindDataset( | |||||
| dataset_path, num_parallel_workers=8, shuffle=False) | dataset_path, num_parallel_workers=8, shuffle=False) | ||||
| else: | else: | ||||
| raise ValueError("Unsupport platform.") | raise ValueError("Unsupport platform.") | ||||
| @@ -68,7 +68,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| color_op = C.RandomColorAdjust( | color_op = C.RandomColorAdjust( | ||||
| brightness=0.4, contrast=0.4, saturation=0.4) | brightness=0.4, contrast=0.4, saturation=0.4) | ||||
| rescale_op = C.Rescale(1/255.0, 0) | |||||
| rescale_op = C.Rescale(1 / 255.0, 0) | |||||
| normalize_op = C.Normalize( | normalize_op = C.Normalize( | ||||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | ||||
| change_swap_op = C.HWC2CHW() | change_swap_op = C.HWC2CHW() | ||||
| @@ -88,18 +88,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||||
| trans = composeop | trans = composeop | ||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(input_columns="image", operations=trans, | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(input_columns="label_list", | |||||
| operations=type_cast_op, num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="image", operations=trans, | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(input_columns="label_list", | |||||
| operations=type_cast_op, num_parallel_workers=8) | |||||
| # apply shuffle operations | # apply shuffle operations | ||||
| ds = ds.shuffle(buffer_size=buffer_size) | |||||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
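
This third copy differs in one detail worth noting: its GPU evaluation branch passes `shuffle=False`, so evaluation visits samples in a fixed order, while training keeps `shuffle=True` and relies on the global seed only for reproducibility. A toy illustration of that difference, assuming MindSpore is installed; `NumpySlicesDataset` stands in for the MindRecord sources used above.

```python
# Toy illustration (assumes MindSpore is installed) of shuffle=False vs shuffle=True.
import numpy as np
import mindspore.dataset as ds

ds.config.set_seed(1)                      # same call the tests later in this diff use
data = np.arange(10).astype(np.int32)

eval_set = ds.NumpySlicesDataset(data, column_names=["x"], shuffle=False)
train_set = ds.NumpySlicesDataset(data, column_names=["x"], shuffle=True)

print([int(r[0].asnumpy()) for r in eval_set.create_tuple_iterator()])   # 0..9 in order
print([int(r[0].asnumpy()) for r in train_set.create_tuple_iterator()])  # seeded permutation
```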
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| from mindspore.communication.management import init, get_rank, get_group_size | from mindspore.communication.management import init, get_rank, get_group_size | ||||
| @@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path, | |||||
| device_num = get_group_size() | device_num = get_group_size() | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| data_set = ds.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| else: | else: | ||||
| ds = de.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| data_set = ds.Cifar10Dataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| # define map operations | # define map operations | ||||
| if do_train: | if do_train: | ||||
| @@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path, | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
| def create_dataset_imagenet(dataset_path, | def create_dataset_imagenet(dataset_path, | ||||
| @@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path, | |||||
| device_num = get_group_size() | device_num = get_group_size() | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, | |||||
| num_parallel_workers=8, | |||||
| shuffle=True, | |||||
| num_shards=device_num, | |||||
| shard_id=rank_id) | |||||
| image_size = 227 | image_size = 227 | ||||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | ||||
| @@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path, | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, | |||||
| input_columns="label", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=trans, | |||||
| input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
| def _get_rank_info(): | def _get_rank_info(): | ||||
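
Every pipeline in this diff batches with `drop_remainder=True`, so samples that do not fill a complete batch are discarded and `get_dataset_size()` reports floor(N / batch_size) batches. A toy check, assuming MindSpore is installed:

```python
# Toy check (assumes MindSpore is installed) of batch(..., drop_remainder=True).
import numpy as np
import mindspore.dataset as ds

data = np.arange(10).astype(np.float32)                      # 10 samples
data_set = ds.NumpySlicesDataset(data, column_names=["x"], shuffle=False)
data_set = data_set.batch(3, drop_remainder=True)            # 10 // 3 = 3 full batches
print(data_set.get_dataset_size())                           # 3 (the last sample is dropped)
```

Dropping the remainder keeps every step's batch shape static, which matters for graph-mode training on Ascend.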
| @@ -21,7 +21,7 @@ from enum import Enum | |||||
| import numpy as np | import numpy as np | ||||
| import pandas as pd | import pandas as pd | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| from .config import DataConfig | from .config import DataConfig | ||||
| @@ -142,8 +142,8 @@ class H5Dataset(): | |||||
| X_id = X[:, 0:self.max_length] | X_id = X[:, 0:self.max_length] | ||||
| X_va = X[:, self.max_length:] | X_va = X[:, self.max_length:] | ||||
| yield np.array(X_id.astype(dtype=np.int32)), \ | yield np.array(X_id.astype(dtype=np.int32)), \ | ||||
| np.array(X_va.astype(dtype=np.float32)), \ | |||||
| np.array(y.astype(dtype=np.float32)) | |||||
| np.array(X_va.astype(dtype=np.float32)), \ | |||||
| np.array(y.astype(dtype=np.float32)) | |||||
| def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | ||||
| @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||||
| for _ in range(0, numbers_of_batch, 1): | for _ in range(0, numbers_of_batch, 1): | ||||
| yield train_eval_gen.__next__() | yield train_eval_gen.__next__() | ||||
| ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||||
| shuffle = train_mode | shuffle = train_mode | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| else: | else: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||||
| for filename in filenames: | for filename in filenames: | ||||
| if file_prefixt_name in filename and 'tfrecord' in filename: | if file_prefixt_name in filename and 'tfrecord' in filename: | ||||
| dataset_files.append(os.path.join(dir_path, filename)) | dataset_files.append(os.path.join(dir_path, filename)) | ||||
| schema = de.Schema() | |||||
| schema = ds.Schema() | |||||
| schema.add_column('feat_ids', de_type=mstype.int32) | schema.add_column('feat_ids', de_type=mstype.int32) | ||||
| schema.add_column('feat_vals', de_type=mstype.float32) | schema.add_column('feat_vals', de_type=mstype.float32) | ||||
| schema.add_column('label', de_type=mstype.float32) | schema.add_column('label', de_type=mstype.float32) | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, | |||||
| shard_equal_rows=True) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, | |||||
| shard_equal_rows=True) | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: ( | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||||
| np.array(x).flatten().reshape(batch_size, 39), | np.array(x).flatten().reshape(batch_size, 39), | ||||
| np.array(y).flatten().reshape(batch_size, 39), | np.array(y).flatten().reshape(batch_size, 39), | ||||
| np.array(z).flatten().reshape(batch_size, 1))), | np.array(z).flatten().reshape(batch_size, 1))), | ||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
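
The map() in `_get_mindrecord_dataset` and `_get_tf_dataset` looks odd until you notice that each stored row packs `line_per_sample` Criteo samples: the pipeline batches `batch_size / line_per_sample` rows and then flattens each column back into `batch_size` samples of 39 features (13 integer plus 26 categorical) and one label. A NumPy-only sketch of that reshape, with illustrative row counts:

```python
# NumPy-only sketch of the reshape done inside the map() above; the row counts
# are illustrative, the 39/1 widths come from the Criteo layout used here.
import numpy as np

batch_size, line_per_sample = 1000, 1000
rows = int(batch_size / line_per_sample)                  # rows fetched per mini-batch (here 1)

feat_ids = np.arange(rows * line_per_sample * 39).reshape(rows, line_per_sample * 39)
labels = np.ones((rows, line_per_sample * 1))

feat_ids = np.array(feat_ids).flatten().reshape(batch_size, 39)   # same lambda as in the diff
labels = np.array(labels).flatten().reshape(batch_size, 1)
print(feat_ids.shape, labels.shape)                               # (1000, 39) (1000, 1)
```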
| @@ -21,7 +21,7 @@ from enum import Enum | |||||
| import pandas as pd | import pandas as pd | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| from .config import DataConfig | from .config import DataConfig | ||||
| @@ -142,8 +142,8 @@ class H5Dataset(): | |||||
| X_id = X[:, 0:self.max_length] | X_id = X[:, 0:self.max_length] | ||||
| X_va = X[:, self.max_length:] | X_va = X[:, self.max_length:] | ||||
| yield np.array(X_id.astype(dtype=np.int32)), \ | yield np.array(X_id.astype(dtype=np.int32)), \ | ||||
| np.array(X_va.astype(dtype=np.float32)), \ | |||||
| np.array(y.astype(dtype=np.float32)) | |||||
| np.array(X_va.astype(dtype=np.float32)), \ | |||||
| np.array(y.astype(dtype=np.float32)) | |||||
| def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | ||||
| @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||||
| for _ in range(0, numbers_of_batch, 1): | for _ in range(0, numbers_of_batch, 1): | ||||
| yield train_eval_gen.__next__() | yield train_eval_gen.__next__() | ||||
| ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||||
| shuffle = train_mode | shuffle = train_mode | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| else: | else: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||||
| for filename in filenames: | for filename in filenames: | ||||
| if file_prefixt_name in filename and 'tfrecord' in filename: | if file_prefixt_name in filename and 'tfrecord' in filename: | ||||
| dataset_files.append(os.path.join(dir_path, filename)) | dataset_files.append(os.path.join(dir_path, filename)) | ||||
| schema = de.Schema() | |||||
| schema = ds.Schema() | |||||
| schema.add_column('feat_ids', de_type=mstype.int32) | schema.add_column('feat_ids', de_type=mstype.int32) | ||||
| schema.add_column('feat_vals', de_type=mstype.float32) | schema.add_column('feat_vals', de_type=mstype.float32) | ||||
| schema.add_column('label', de_type=mstype.float32) | schema.add_column('label', de_type=mstype.float32) | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, | |||||
| shard_equal_rows=True, num_samples=3000) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, | |||||
| shard_equal_rows=True, num_samples=3000) | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, num_samples=3000) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: ( | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||||
| schema=schema, num_parallel_workers=8, num_samples=3000) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||||
| np.array(x).flatten().reshape(batch_size, 39), | np.array(x).flatten().reshape(batch_size, 39), | ||||
| np.array(y).flatten().reshape(batch_size, 39), | np.array(y).flatten().reshape(batch_size, 39), | ||||
| np.array(z).flatten().reshape(batch_size, 1))), | np.array(z).flatten().reshape(batch_size, 1))), | ||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
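
The test copy of this file is nearly identical to the model_zoo version above it; the functional additions are the `num_samples=3000` arguments, which cap how many rows the generator and TFRecord sources emit so the smoke test stays small. A minimal sketch of the cap, assuming MindSpore is installed:

```python
# Minimal sketch (assumes MindSpore is installed): num_samples truncates the source.
import numpy as np
import mindspore.dataset as ds


def gen():
    for i in range(100000):                       # a source far larger than the cap
        yield (np.array(i, dtype=np.int32),)


capped = ds.GeneratorDataset(gen, ["ids"], num_samples=3000)
print(sum(1 for _ in capped.create_tuple_iterator()))   # 3000
```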
| @@ -24,17 +24,18 @@ from mindspore.nn.optim import Adam | |||||
| from mindspore.train.model import Model | from mindspore.train.model import Model | ||||
| from mindspore.train.loss_scale_manager import DynamicLossScaleManager | from mindspore.train.loss_scale_manager import DynamicLossScaleManager | ||||
| from mindspore.train.callback import Callback | from mindspore.train.callback import Callback | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as deC | import mindspore.dataset.transforms.c_transforms as deC | ||||
| from mindspore import context | from mindspore import context | ||||
| from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig | from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig | ||||
| from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \ | from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \ | ||||
| TransformerTrainOneStepWithLossScaleCell | |||||
| TransformerTrainOneStepWithLossScaleCell | |||||
| from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg | from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg | ||||
| from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr | from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr | ||||
| DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"] | DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"] | ||||
| def get_config(version='base', batch_size=1): | def get_config(version='base', batch_size=1): | ||||
| """get config""" | """get config""" | ||||
| if version == 'large': | if version == 'large': | ||||
| @@ -75,23 +76,25 @@ def get_config(version='base', batch_size=1): | |||||
| transformer_cfg = TransformerConfig(batch_size=batch_size) | transformer_cfg = TransformerConfig(batch_size=batch_size) | ||||
| return transformer_cfg | return transformer_cfg | ||||
| def load_test_data(batch_size=1, data_file=None): | def load_test_data(batch_size=1, data_file=None): | ||||
| """Load test dataset.""" | """Load test dataset.""" | ||||
| ds = de.MindDataset(data_file, | |||||
| columns_list=["source_eos_ids", "source_eos_mask", | |||||
| "target_sos_ids", "target_sos_mask", | |||||
| "target_eos_ids", "target_eos_mask"], | |||||
| shuffle=False) | |||||
| data_set = ds.MindDataset(data_file, | |||||
| columns_list=["source_eos_ids", "source_eos_mask", | |||||
| "target_sos_ids", "target_sos_mask", | |||||
| "target_eos_ids", "target_eos_mask"], | |||||
| shuffle=False) | |||||
| type_cast_op = deC.TypeCast(mstype.int32) | type_cast_op = deC.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| return data_set | |||||
| class ModelCallback(Callback): | class ModelCallback(Callback): | ||||
| def __init__(self): | def __init__(self): | ||||
| @@ -107,13 +110,16 @@ class ModelCallback(Callback): | |||||
| self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) | self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) | ||||
| print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) | print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) | ||||
| class TimeMonitor(Callback): | class TimeMonitor(Callback): | ||||
| """Time Monitor.""" | """Time Monitor.""" | ||||
| def __init__(self, data_size): | def __init__(self, data_size): | ||||
| super(TimeMonitor, self).__init__() | super(TimeMonitor, self).__init__() | ||||
| self.data_size = data_size | self.data_size = data_size | ||||
| self.epoch_mseconds_list = [] | self.epoch_mseconds_list = [] | ||||
| self.per_step_mseconds_list = [] | self.per_step_mseconds_list = [] | ||||
| def epoch_begin(self, run_context): | def epoch_begin(self, run_context): | ||||
| self.epoch_time = time.time() | self.epoch_time = time.time() | ||||
| @@ -122,6 +128,7 @@ class TimeMonitor(Callback): | |||||
| self.epoch_mseconds_list.append(epoch_mseconds) | self.epoch_mseconds_list.append(epoch_mseconds) | ||||
| self.per_step_mseconds_list.append(epoch_mseconds / self.data_size) | self.per_step_mseconds_list.append(epoch_mseconds / self.data_size) | ||||
| @pytest.mark.level0 | @pytest.mark.level0 | ||||
| @pytest.mark.platform_arm_ascend_training | @pytest.mark.platform_arm_ascend_training | ||||
| @pytest.mark.platform_x86_ascend_training | @pytest.mark.platform_x86_ascend_training | ||||
| @@ -142,7 +149,7 @@ def test_transformer(): | |||||
| netwithloss = TransformerNetworkWithLoss(config, True) | netwithloss = TransformerNetworkWithLoss(config, True) | ||||
| lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay", | lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay", | ||||
| training_steps=dataset.get_dataset_size()*epoch_size, | |||||
| training_steps=dataset.get_dataset_size() * epoch_size, | |||||
| learning_rate=cfg.lr_schedule.learning_rate, | learning_rate=cfg.lr_schedule.learning_rate, | ||||
| warmup_steps=cfg.lr_schedule.warmup_steps, | warmup_steps=cfg.lr_schedule.warmup_steps, | ||||
| hidden_size=config.hidden_size), mstype.float32) | hidden_size=config.hidden_size), mstype.float32) | ||||
| @@ -193,5 +200,6 @@ def test_transformer(): | |||||
| print("per step mseconds: {}".format(per_step_mseconds)) | print("per step mseconds: {}".format(per_step_mseconds)) | ||||
| assert per_step_mseconds <= expect_per_step_mseconds + 2 | assert per_step_mseconds <= expect_per_step_mseconds + 2 | ||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| test_transformer() | test_transformer() | ||||
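
Both this transformer test and the BERT test below define the same TimeMonitor callback, but the hunks only show fragments of it. The sketch below reconstructs the obvious reading; the millisecond conversion line is inferred, not shown in the diff.

```python
# Reconstruction of the TimeMonitor fragments above; the elapsed-time conversion
# to milliseconds is an assumption.
import time
from mindspore.train.callback import Callback


class TimeMonitor(Callback):
    """Time Monitor."""

    def __init__(self, data_size):
        super(TimeMonitor, self).__init__()
        self.data_size = data_size
        self.epoch_time = 0.0
        self.epoch_mseconds_list = []
        self.per_step_mseconds_list = []

    def epoch_begin(self, run_context):
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        epoch_mseconds = (time.time() - self.epoch_time) * 1000   # assumed conversion
        self.epoch_mseconds_list.append(epoch_mseconds)
        self.per_step_mseconds_list.append(epoch_mseconds / self.data_size)
```

The per-step figure is the epoch time divided by `data_size`, which these tests set to the sink size, so the asserted thresholds are milliseconds per training step.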
| @@ -14,13 +14,13 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """train_imagenet.""" | """train_imagenet.""" | ||||
| import os | import os | ||||
| from enum import Enum | from enum import Enum | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| class DataType(Enum): | class DataType(Enum): | ||||
| """ | """ | ||||
| Enumerate supported dataset format. | Enumerate supported dataset format. | ||||
| @@ -29,6 +29,7 @@ class DataType(Enum): | |||||
| TFRECORD = 2 | TFRECORD = 2 | ||||
| H5 = 3 | H5 = 3 | ||||
| def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | ||||
| line_per_sample=1000, rank_size=None, rank_id=None): | line_per_sample=1000, rank_size=None, rank_id=None): | ||||
| """ | """ | ||||
| @@ -41,26 +42,29 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | |||||
| for filename in filenames: | for filename in filenames: | ||||
| if file_prefix_name in filename and "tfrecord" in filename: | if file_prefix_name in filename and "tfrecord" in filename: | ||||
| dataset_files.append(os.path.join(dirpath, filename)) | dataset_files.append(os.path.join(dirpath, filename)) | ||||
| schema = de.Schema() | |||||
| schema = ds.Schema() | |||||
| schema.add_column('feat_ids', de_type=mstype.int32) | schema.add_column('feat_ids', de_type=mstype.int32) | ||||
| schema.add_column('feat_vals', de_type=mstype.float32) | schema.add_column('feat_vals', de_type=mstype.float32) | ||||
| schema.add_column('label', de_type=mstype.float32) | schema.add_column('label', de_type=mstype.float32) | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, | |||||
| num_parallel_workers=8, | |||||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||||
| else: | else: | ||||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), | |||||
| drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: ( | |||||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), | |||||
| drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||||
| np.array(x).flatten().reshape(batch_size, 39), | np.array(x).flatten().reshape(batch_size, 39), | ||||
| np.array(y).flatten().reshape(batch_size, 39), | np.array(y).flatten().reshape(batch_size, 39), | ||||
| np.array(z).flatten().reshape(batch_size, 1))), | np.array(z).flatten().reshape(batch_size, 1))), | ||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||||
| #if train_mode: | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||||
| # if train_mode: | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | ||||
| line_per_sample=1000, rank_size=None, rank_id=None): | line_per_sample=1000, rank_size=None, rank_id=None): | ||||
| @@ -84,23 +88,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||||
| shuffle = train_mode | shuffle = train_mode | ||||
| if rank_size is not None and rank_id is not None: | if rank_size is not None and rank_id is not None: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||||
| num_parallel_workers=8) | |||||
| else: | else: | ||||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| ds = ds.repeat(epochs) | |||||
| return ds | |||||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||||
| shuffle=shuffle, num_parallel_workers=8) | |||||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||||
| np.array(y).flatten().reshape(batch_size, 39), | |||||
| np.array(z).flatten().reshape(batch_size, 1))), | |||||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.repeat(epochs) | |||||
| return data_set | |||||
| def create_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | def create_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | ||||
| @@ -20,7 +20,7 @@ import time | |||||
| import numpy as np | import numpy as np | ||||
| import pytest | import pytest | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| from mindspore import context | from mindspore import context | ||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| @@ -35,7 +35,6 @@ from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertNetworkWit | |||||
| from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertTrainOneStepWithLossScaleCell | from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertTrainOneStepWithLossScaleCell | ||||
| from model_zoo.official.nlp.bert.src.bert_model import BertConfig | from model_zoo.official.nlp.bert.src.bert_model import BertConfig | ||||
| _current_dir = os.path.dirname(os.path.realpath(__file__)) | _current_dir = os.path.dirname(os.path.realpath(__file__)) | ||||
| DATA_DIR = ["/home/workspace/mindspore_dataset/bert/example/examples.tfrecord"] | DATA_DIR = ["/home/workspace/mindspore_dataset/bert/example/examples.tfrecord"] | ||||
| SCHEMA_DIR = "/home/workspace/mindspore_dataset/bert/example/datasetSchema.json" | SCHEMA_DIR = "/home/workspace/mindspore_dataset/bert/example/datasetSchema.json" | ||||
| @@ -88,25 +87,26 @@ def me_de_train_dataset(sink_mode=False): | |||||
| repeat_count = 1 | repeat_count = 1 | ||||
| sink_size = -1 | sink_size = -1 | ||||
| batch_size = 16 | batch_size = 16 | ||||
| ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||||
| "next_sentence_labels", "masked_lm_positions", | |||||
| "masked_lm_ids", "masked_lm_weights"], shuffle=False) | |||||
| data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||||
| "next_sentence_labels", "masked_lm_positions", | |||||
| "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=False) | |||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| new_repeat_count = repeat_count | new_repeat_count = repeat_count | ||||
| if sink_mode: | if sink_mode: | ||||
| sink_size = 100 | sink_size = 100 | ||||
| new_repeat_count = 3 | new_repeat_count = 3 | ||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||||
| logger.info("repeat_count: {}".format(ds.get_repeat_count())) | |||||
| return ds, new_repeat_count, sink_size | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||||
| logger.info("repeat_count: {}".format(data_set.get_repeat_count())) | |||||
| return data_set, new_repeat_count, sink_size | |||||
| def weight_variable(shape): | def weight_variable(shape): | ||||
| @@ -155,13 +155,16 @@ class ModelCallback(Callback): | |||||
| self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) | self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) | ||||
| print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) | print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) | ||||
| class TimeMonitor(Callback): | class TimeMonitor(Callback): | ||||
| """Time Monitor.""" | """Time Monitor.""" | ||||
| def __init__(self, data_size): | def __init__(self, data_size): | ||||
| super(TimeMonitor, self).__init__() | super(TimeMonitor, self).__init__() | ||||
| self.data_size = data_size | self.data_size = data_size | ||||
| self.epoch_mseconds_list = [] | self.epoch_mseconds_list = [] | ||||
| self.per_step_mseconds_list = [] | self.per_step_mseconds_list = [] | ||||
| def epoch_begin(self, run_context): | def epoch_begin(self, run_context): | ||||
| self.epoch_time = time.time() | self.epoch_time = time.time() | ||||
| @@ -178,7 +181,7 @@ class TimeMonitor(Callback): | |||||
| def test_bert_performance(): | def test_bert_performance(): | ||||
| """test bert performance""" | """test bert performance""" | ||||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) | context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) | ||||
| ds, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True) | |||||
| data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True) | |||||
| version = os.getenv('VERSION', 'large') | version = os.getenv('VERSION', 'large') | ||||
| config = get_config(version=version) | config = get_config(version=version) | ||||
| netwithloss = BertNetworkWithLoss(config, True) | netwithloss = BertNetworkWithLoss(config, True) | ||||
| @@ -221,7 +224,7 @@ def test_bert_performance(): | |||||
| logger.info("***************** BERT param name is 3 {}".format(name)) | logger.info("***************** BERT param name is 3 {}".format(name)) | ||||
| param.set_data(weight_variable(value.asnumpy().shape)) | param.set_data(weight_variable(value.asnumpy().shape)) | ||||
| time_monitor_callback = TimeMonitor(sink_size) | time_monitor_callback = TimeMonitor(sink_size) | ||||
| model.train(new_repeat_count, ds, callbacks=[time_monitor_callback, callback], | |||||
| model.train(new_repeat_count, data_set, callbacks=[time_monitor_callback, callback], | |||||
| dataset_sink_mode=True, sink_size=sink_size) | dataset_sink_mode=True, sink_size=sink_size) | ||||
| # assertion occurs while the loss value, overflow state or loss_scale value is wrong | # assertion occurs while the loss value, overflow state or loss_scale value is wrong | ||||
| @@ -250,5 +253,6 @@ def test_bert_performance(): | |||||
| print("per step mseconds: {}".format(per_step_mseconds)) | print("per step mseconds: {}".format(per_step_mseconds)) | ||||
| assert per_step_mseconds <= expect_per_step_mseconds + 1 | assert per_step_mseconds <= expect_per_step_mseconds + 1 | ||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| test_bert_performance() | test_bert_performance() | ||||
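
For context on the numbers in `me_de_train_dataset()`: with `dataset_sink_mode=True` and `sink_size=100`, each "epoch" that `model.train` runs consumes 100 steps, so `new_repeat_count=3` amounts to 300 training steps regardless of the TFRecord's actual size. A back-of-envelope sketch:

```python
# Back-of-envelope arithmetic mirroring the sink-mode settings above.
sink_mode = True
sink_size = 100 if sink_mode else -1       # steps fed per sunk "epoch"
new_repeat_count = 3 if sink_mode else 1
total_steps = new_repeat_count * sink_size if sink_mode else None
print(total_steps)                          # 300 steps executed by model.train(...)
```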
| @@ -20,7 +20,7 @@ import time | |||||
| from multiprocessing import Process, Queue | from multiprocessing import Process, Queue | ||||
| import pytest | import pytest | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.dataset as dataset | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.communication.management as D | import mindspore.communication.management as D | ||||
| from mindspore import context | from mindspore import context | ||||
| @@ -28,7 +28,6 @@ from mindspore import log as logger | |||||
| from mindspore.train.callback import Callback | from mindspore.train.callback import Callback | ||||
| from mindspore.context import ParallelMode | from mindspore.context import ParallelMode | ||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | from mindspore.train.serialization import load_checkpoint, load_param_into_net | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| from model_zoo.official.nlp.bert_thor.src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepCell | from model_zoo.official.nlp.bert_thor.src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepCell | ||||
| from model_zoo.official.nlp.bert_thor.src.bert_net_config import bert_net_cfg | from model_zoo.official.nlp.bert_thor.src.bert_net_config import bert_net_cfg | ||||
| @@ -45,11 +44,13 @@ train_steps = 200 | |||||
| batch_size = 12 | batch_size = 12 | ||||
| np.random.seed(1) | np.random.seed(1) | ||||
| dataset.config.set_seed(1) | |||||
| ds.config.set_seed(1) | |||||
| os.environ['GLOG_v'] = str(2) | os.environ['GLOG_v'] = str(2) | ||||
| class TimeMonitor(Callback): | class TimeMonitor(Callback): | ||||
| """Time Monitor.""" | """Time Monitor.""" | ||||
| def __init__(self, data_size): | def __init__(self, data_size): | ||||
| super(TimeMonitor, self).__init__() | super(TimeMonitor, self).__init__() | ||||
| self.data_size = data_size | self.data_size = data_size | ||||
| @@ -67,6 +68,7 @@ class TimeMonitor(Callback): | |||||
| self.per_step_mseconds_list.append(per_step_mseconds) | self.per_step_mseconds_list.append(per_step_mseconds) | ||||
| print("epoch: {}, per_step_mseconds are {}".format(cb_params.cur_epoch_num, str(per_step_mseconds)), flush=True) | print("epoch: {}, per_step_mseconds are {}".format(cb_params.cur_epoch_num, str(per_step_mseconds)), flush=True) | ||||
| class LossCallback(Callback): | class LossCallback(Callback): | ||||
| def __init__(self): | def __init__(self): | ||||
| super(LossCallback, self).__init__() | super(LossCallback, self).__init__() | ||||
| @@ -78,6 +80,7 @@ class LossCallback(Callback): | |||||
| print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num, | print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num, | ||||
| str(cb_params.net_outputs)), flush=True) | str(cb_params.net_outputs)), flush=True) | ||||
| def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None): | def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None): | ||||
| """create train dataset""" | """create train dataset""" | ||||
| # apply repeat operations | # apply repeat operations | ||||
| @@ -87,25 +90,25 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, | |||||
| if "tfrecord" in file_name: | if "tfrecord" in file_name: | ||||
| data_files.append(os.path.join(data_dir, file_name)) | data_files.append(os.path.join(data_dir, file_name)) | ||||
| data_files = sorted(data_files) | data_files = sorted(data_files) | ||||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, | |||||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, | |||||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print('origin dataset size: ', ori_dataset_size) | print('origin dataset size: ', ori_dataset_size) | ||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||||
| return ds | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||||
| return data_set | |||||
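The pattern in this hunk repeats throughout the change: the module alias moves from de (mindspore.dataset.engine) to ds (mindspore.dataset), and the local variable is renamed from ds to data_set, presumably so it no longer shadows the new alias. A minimal sketch of the shadowing problem the rename avoids (not part of the patch; it uses NumpySlicesDataset only so it runs without TFRecord files):

    import numpy as np
    import mindspore.dataset as ds

    data = {"x": np.arange(10, dtype=np.int32)}

    # If the local variable kept the old name, the first assignment would shadow
    # the module alias, and any later ds.* lookup (ds.Shuffle, another ds.*Dataset)
    # would hit the dataset object instead of the module:
    # ds = ds.NumpySlicesDataset(data, shuffle=False)

    # Renaming the local keeps the alias usable:
    data_set = ds.NumpySlicesDataset(data, shuffle=False)
    data_set = data_set.batch(2, drop_remainder=True)
    print(data_set.get_dataset_size())  # 5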
| def _set_bert_all_reduce_split(): | def _set_bert_all_reduce_split(): | ||||
| @@ -151,13 +154,13 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num): | |||||
| device_num=device_num) | device_num=device_num) | ||||
| bert_net_cfg.num_hidden_layers = 4 | bert_net_cfg.num_hidden_layers = 4 | ||||
| ds = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, schema_dir=None) | |||||
| data_set = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, | |||||
| schema_dir=None) | |||||
| net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) | net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) | ||||
| new_repeat_count = epoch_size * ds.get_dataset_size() // data_sink_steps | |||||
| new_repeat_count = epoch_size * data_set.get_dataset_size() // data_sink_steps | |||||
| new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps) | new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps) | ||||
| lr = get_bert_lr() | lr = get_bert_lr() | ||||
| damping = get_bert_damping() | damping = get_bert_damping() | ||||
| optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum, | optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum, | ||||
| @@ -175,7 +178,7 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num): | |||||
| net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) | net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) | ||||
| model = Model(net_with_grads, frequency=cfg.Thor.frequency) | model = Model(net_with_grads, frequency=cfg.Thor.frequency) | ||||
| model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps) | |||||
| model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps) | |||||
| loss_list = loss_callback.loss_list | loss_list = loss_callback.loss_list | ||||
| per_step_mseconds = time_monitor_callback.per_step_mseconds_list | per_step_mseconds = time_monitor_callback.per_step_mseconds_list | ||||
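The sink arithmetic a few lines up decides how many times model.train re-enters the sunk graph. A worked example with hypothetical numbers (the real values come from the test's constants, which are not shown in this hunk):

    # Hypothetical values, for illustration only.
    epoch_size = 2
    dataset_size = 10_000      # data_set.get_dataset_size() after batching
    data_sink_steps = 100
    train_steps = 3_000

    new_repeat_count = epoch_size * dataset_size // data_sink_steps           # 200
    new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps)  # capped to 30
    # model.train(new_repeat_count, ..., sink_size=data_sink_steps) then runs
    # 30 * 100 = 3,000 steps in total.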
| @@ -230,5 +233,6 @@ def test_bert_thor_mlperf_8p(): | |||||
| assert mean_cost < 64.2 | assert mean_cost < 64.2 | ||||
| assert mean_loss < 7.9 | assert mean_loss < 7.9 | ||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| test_bert_thor_mlperf_8p() | test_bert_thor_mlperf_8p() | ||||
| @@ -20,7 +20,7 @@ import time | |||||
| import numpy as np | import numpy as np | ||||
| import pytest | import pytest | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| from mindspore import context | from mindspore import context | ||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| @@ -87,25 +87,26 @@ def me_de_train_dataset(sink_mode=False): | |||||
| repeat_count = 1 | repeat_count = 1 | ||||
| sink_size = -1 | sink_size = -1 | ||||
| batch_size = 16 | batch_size = 16 | ||||
| ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||||
| "next_sentence_labels", "masked_lm_positions", | |||||
| "masked_lm_ids", "masked_lm_weights"], shuffle=False) | |||||
| data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||||
| "next_sentence_labels", "masked_lm_positions", | |||||
| "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=False) | |||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| new_repeat_count = repeat_count | new_repeat_count = repeat_count | ||||
| if sink_mode: | if sink_mode: | ||||
| sink_size = 100 | sink_size = 100 | ||||
| new_repeat_count = 3 | new_repeat_count = 3 | ||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||||
| logger.info("repeat_count: {}".format(ds.get_repeat_count())) | |||||
| return ds, new_repeat_count, sink_size | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||||
| logger.info("repeat_count: {}".format(data_set.get_repeat_count())) | |||||
| return data_set, new_repeat_count, sink_size | |||||
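All six TypeCast maps above target int32. A quick way to confirm the casts and the batch shape, sketched here rather than taken from the test file (it assumes a MindSpore version whose create_dict_iterator accepts output_numpy):

    # Sanity check, not part of the test: pull one batch and inspect dtypes/shapes.
    data_set, _, _ = me_de_train_dataset()
    row = next(data_set.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert str(row["masked_lm_ids"].dtype) == "int32"
    assert row["input_ids"].shape[0] == 16  # batch_size set above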
| def weight_variable(shape): | def weight_variable(shape): | ||||
| @@ -178,11 +179,11 @@ def test_bert_percision(enable_graph_kernel=False): | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) | context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) | ||||
| if enable_graph_kernel: | if enable_graph_kernel: | ||||
| context.set_context(enable_graph_kernel=True) | context.set_context(enable_graph_kernel=True) | ||||
| ds, new_repeat_count, _ = me_de_train_dataset() | |||||
| data_set, new_repeat_count, _ = me_de_train_dataset() | |||||
| version = os.getenv('VERSION', 'large') | version = os.getenv('VERSION', 'large') | ||||
| config = get_config(version=version) | config = get_config(version=version) | ||||
| netwithloss = BertNetworkWithLoss(config, True) | netwithloss = BertNetworkWithLoss(config, True) | ||||
| lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count, | |||||
| lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count, | |||||
| learning_rate=5e-5, end_learning_rate=1e-9, | learning_rate=5e-5, end_learning_rate=1e-9, | ||||
| power=10.0, warmup_steps=0) | power=10.0, warmup_steps=0) | ||||
| decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower() | decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower() | ||||
| @@ -218,7 +219,7 @@ def test_bert_percision(enable_graph_kernel=False): | |||||
| else: | else: | ||||
| logger.info("***************** BERT param name is 3 {}".format(name)) | logger.info("***************** BERT param name is 3 {}".format(name)) | ||||
| param.set_data(weight_variable(value.asnumpy().shape)) | param.set_data(weight_variable(value.asnumpy().shape)) | ||||
| model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False) | |||||
| model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False) | |||||
| # an assertion is raised when the loss value, overflow state or loss_scale value is wrong | # an assertion is raised when the loss value, overflow state or loss_scale value is wrong | ||||
| loss_value = np.array(callback.loss_list) | loss_value = np.array(callback.loss_list) | ||||
| @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine.datasets as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| from .config import bert_net_cfg | from .config import bert_net_cfg | ||||
| @@ -32,24 +32,24 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", d | |||||
| for file_name in files: | for file_name in files: | ||||
| if "tfrecord" in file_name: | if "tfrecord" in file_name: | ||||
| data_files.append(os.path.join(data_dir, file_name)) | data_files.append(os.path.join(data_dir, file_name)) | ||||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, | |||||
| shard_equal_rows=True) | |||||
| ori_dataset_size = ds.get_dataset_size() | |||||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, | |||||
| shard_equal_rows=True) | |||||
| ori_dataset_size = data_set.get_dataset_size() | |||||
| print('origin dataset size: ', ori_dataset_size) | print('origin dataset size: ', ori_dataset_size) | ||||
| new_repeat_count = int(repeat_count * ori_dataset_size // ds.get_dataset_size()) | |||||
| new_repeat_count = int(repeat_count * ori_dataset_size // data_set.get_dataset_size()) | |||||
| type_cast_op = C.TypeCast(mstype.int32) | type_cast_op = C.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||||
| ds = ds.repeat(max(new_repeat_count, repeat_count)) | |||||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||||
| logger.info("repeatcount: {}".format(ds.get_repeat_count())) | |||||
| return ds, new_repeat_count | |||||
| data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||||
| data_set = data_set.repeat(max(new_repeat_count, repeat_count)) | |||||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||||
| logger.info("repeatcount: {}".format(data_set.get_repeat_count())) | |||||
| return data_set, new_repeat_count | |||||
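As written, ori_dataset_size and data_set.get_dataset_size() are read back-to-back before any size-changing operation, so new_repeat_count effectively equals repeat_count; the sizes only diverge once batch() shrinks the step count. With hypothetical numbers (not from the patch):

    # Hypothetical sizes, for illustration only.
    ori_dataset_size = 320_000                 # rows before batching
    repeat_count = 1
    new_repeat_count = int(repeat_count * ori_dataset_size // ori_dataset_size)  # 1
    batch_size = 32
    steps_per_epoch = ori_dataset_size // batch_size                             # 10_000
    # After .batch(batch_size).repeat(max(new_repeat_count, repeat_count)),
    # the "data size" logged above reports 10_000 * 1 = 10_000 steps.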
| @@ -17,7 +17,7 @@ | |||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| @@ -39,10 +39,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||||
| device_num = int(os.getenv("RANK_SIZE")) | device_num = int(os.getenv("RANK_SIZE")) | ||||
| rank_id = int(os.getenv("RANK_ID")) | rank_id = int(os.getenv("RANK_ID")) | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| image_size = 224 | image_size = 224 | ||||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | ||||
| @@ -65,15 +65,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||||
| C.HWC2CHW() | C.HWC2CHW() | ||||
| ] | ] | ||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| return ds | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return data_set | |||||
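This helper (and the nearly identical one in the next file) reads the shard layout from RANK_SIZE / RANK_ID in the environment rather than from arguments. A minimal single-device invocation, sketched with a placeholder path:

    import os

    os.environ.setdefault("RANK_SIZE", "1")
    os.environ.setdefault("RANK_ID", "0")

    # "/path/to/imagenet/train" is a placeholder, not a path from this repository.
    train_set = create_dataset("/path/to/imagenet/train", do_train=True, batch_size=32)
    print(train_set.get_dataset_size())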
| @@ -18,12 +18,11 @@ | |||||
| import os | import os | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset as dataset | |||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| dataset.config.set_seed(1) | |||||
| ds.config.set_seed(1) | |||||
| def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | ||||
| @@ -43,10 +42,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||||
| device_num = int(os.getenv("RANK_SIZE")) | device_num = int(os.getenv("RANK_SIZE")) | ||||
| rank_id = int(os.getenv("RANK_ID")) | rank_id = int(os.getenv("RANK_ID")) | ||||
| if device_num == 1: | if device_num == 1: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| else: | else: | ||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=device_num, shard_id=rank_id) | |||||
| image_size = 224 | image_size = 224 | ||||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | ||||
| @@ -71,12 +70,12 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| return ds | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return data_set | |||||
| @@ -14,11 +14,10 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """ create train dataset. """ | """ create train dataset. """ | ||||
| from functools import partial | from functools import partial | ||||
| import mindspore.common.dtype as mstype | import mindspore.common.dtype as mstype | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| @@ -37,8 +36,8 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32): | |||||
| dataset | dataset | ||||
| """ | """ | ||||
| load_func = partial(de.Cifar10Dataset, dataset_path) | |||||
| ds = load_func(num_parallel_workers=8, shuffle=False) | |||||
| load_func = partial(ds.Cifar10Dataset, dataset_path) | |||||
| data_set = load_func(num_parallel_workers=8, shuffle=False) | |||||
| resize_height = config.image_height | resize_height = config.image_height | ||||
| resize_width = config.image_width | resize_width = config.image_width | ||||
| @@ -54,15 +53,15 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32): | |||||
| type_cast_op = C2.TypeCast(mstype.int32) | type_cast_op = C2.TypeCast(mstype.int32) | ||||
| ds = ds.map(operations=c_trans, input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| ds = ds.map(operations=type_cast_op, | |||||
| input_columns="label", num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=c_trans, input_columns="image", | |||||
| num_parallel_workers=8) | |||||
| data_set = data_set.map(operations=type_cast_op, | |||||
| input_columns="label", num_parallel_workers=8) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | |||||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | |||||
| data_set = data_set.repeat(repeat_num) | |||||
| return ds | |||||
| return data_set | |||||
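The Cifar10 variant builds its loader through functools.partial; nothing else changes. The two calls below are equivalent (a sketch; the path is a placeholder):

    from functools import partial
    import mindspore.dataset as ds

    dataset_path = "/path/to/cifar-10-batches-bin"  # placeholder

    load_func = partial(ds.Cifar10Dataset, dataset_path)
    data_set_a = load_func(num_parallel_workers=8, shuffle=False)

    # ...the same as calling the constructor directly:
    data_set_b = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=False)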
| @@ -16,7 +16,7 @@ | |||||
| Testing AutoContrast op in DE | Testing AutoContrast op in DE | ||||
| """ | """ | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.py_transforms | import mindspore.dataset.transforms.py_transforms | ||||
| import mindspore.dataset.vision.py_transforms as F | import mindspore.dataset.vision.py_transforms as F | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| @@ -36,13 +36,13 @@ def test_auto_contrast_py(plot=False): | |||||
| logger.info("Test AutoContrast Python Op") | logger.info("Test AutoContrast Python Op") | ||||
| # Original Images | # Original Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = ds_original.batch(512) | ds_original = ds_original.batch(512) | ||||
| @@ -55,7 +55,7 @@ def test_auto_contrast_py(plot=False): | |||||
| axis=0) | axis=0) | ||||
| # AutoContrast Images | # AutoContrast Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_auto_contrast = \ | transforms_auto_contrast = \ | ||||
| mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| @@ -63,7 +63,7 @@ def test_auto_contrast_py(plot=False): | |||||
| F.AutoContrast(cutoff=10.0, ignore=[10, 20]), | F.AutoContrast(cutoff=10.0, ignore=[10, 20]), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_auto_contrast = ds.map(operations=transforms_auto_contrast, input_columns="image") | |||||
| ds_auto_contrast = data_set.map(operations=transforms_auto_contrast, input_columns="image") | |||||
| ds_auto_contrast = ds_auto_contrast.batch(512) | ds_auto_contrast = ds_auto_contrast.batch(512) | ||||
| @@ -96,15 +96,15 @@ def test_auto_contrast_c(plot=False): | |||||
| logger.info("Test AutoContrast C Op") | logger.info("Test AutoContrast C Op") | ||||
| # AutoContrast Images | # AutoContrast Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| python_op = F.AutoContrast(cutoff=10.0, ignore=[10, 20]) | python_op = F.AutoContrast(cutoff=10.0, ignore=[10, 20]) | ||||
| c_op = C.AutoContrast(cutoff=10.0, ignore=[10, 20]) | c_op = C.AutoContrast(cutoff=10.0, ignore=[10, 20]) | ||||
| transforms_op = mindspore.dataset.transforms.py_transforms.Compose([lambda img: F.ToPIL()(img.astype(np.uint8)), | transforms_op = mindspore.dataset.transforms.py_transforms.Compose([lambda img: F.ToPIL()(img.astype(np.uint8)), | ||||
| python_op, | python_op, | ||||
| np.array]) | np.array]) | ||||
| ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image") | |||||
| ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image") | |||||
| ds_auto_contrast_py = ds_auto_contrast_py.batch(512) | ds_auto_contrast_py = ds_auto_contrast_py.batch(512) | ||||
| @@ -116,10 +116,10 @@ def test_auto_contrast_c(plot=False): | |||||
| image.asnumpy(), | image.asnumpy(), | ||||
| axis=0) | axis=0) | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image") | |||||
| ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image") | |||||
| ds_auto_contrast_c = ds_auto_contrast_c.batch(512) | ds_auto_contrast_c = ds_auto_contrast_c.batch(512) | ||||
| @@ -153,8 +153,8 @@ def test_auto_contrast_one_channel_c(plot=False): | |||||
| logger.info("Test AutoContrast C Op With One Channel Images") | logger.info("Test AutoContrast C Op With One Channel Images") | ||||
| # AutoContrast Images | # AutoContrast Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| python_op = F.AutoContrast() | python_op = F.AutoContrast() | ||||
| c_op = C.AutoContrast() | c_op = C.AutoContrast() | ||||
| # not using F.ToTensor() since it converts to floats | # not using F.ToTensor() since it converts to floats | ||||
| @@ -164,7 +164,7 @@ def test_auto_contrast_one_channel_c(plot=False): | |||||
| python_op, | python_op, | ||||
| np.array]) | np.array]) | ||||
| ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image") | |||||
| ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image") | |||||
| ds_auto_contrast_py = ds_auto_contrast_py.batch(512) | ds_auto_contrast_py = ds_auto_contrast_py.batch(512) | ||||
| @@ -176,11 +176,11 @@ def test_auto_contrast_one_channel_c(plot=False): | |||||
| image.asnumpy(), | image.asnumpy(), | ||||
| axis=0) | axis=0) | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])], | |||||
| input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])], | |||||
| input_columns=["image"]) | |||||
| ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image") | |||||
| ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image") | |||||
| ds_auto_contrast_c = ds_auto_contrast_c.batch(512) | ds_auto_contrast_c = ds_auto_contrast_c.batch(512) | ||||
| @@ -208,9 +208,9 @@ def test_auto_contrast_mnist_c(plot=False): | |||||
| Test AutoContrast C op with MNIST dataset (Grayscale images) | Test AutoContrast C op with MNIST dataset (Grayscale images) | ||||
| """ | """ | ||||
| logger.info("Test AutoContrast C Op With MNIST Images") | logger.info("Test AutoContrast C Op With MNIST Images") | ||||
| ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| ds_auto_contrast_c = ds.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image") | |||||
| ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| ds_auto_contrast_c = data_set.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image") | |||||
| ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| images = [] | images = [] | ||||
| images_trans = [] | images_trans = [] | ||||
| @@ -236,21 +236,21 @@ def test_auto_contrast_invalid_ignore_param_c(): | |||||
| """ | """ | ||||
| logger.info("Test AutoContrast C Op with invalid ignore parameter") | logger.info("Test AutoContrast C Op with invalid ignore parameter") | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), | |||||
| C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), | |||||
| C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| # invalid ignore | # invalid ignore | ||||
| ds = ds.map(operations=C.AutoContrast(ignore=255.5), input_columns="image") | |||||
| data_set = data_set.map(operations=C.AutoContrast(ignore=255.5), input_columns="image") | |||||
| except TypeError as error: | except TypeError as error: | ||||
| logger.info("Got an exception in DE: {}".format(str(error))) | logger.info("Got an exception in DE: {}".format(str(error))) | ||||
| assert "Argument ignore with value 255.5 is not of type" in str(error) | assert "Argument ignore with value 255.5 is not of type" in str(error) | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| # invalid ignore | # invalid ignore | ||||
| ds = ds.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image") | |||||
| data_set = data_set.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image") | |||||
| except TypeError as error: | except TypeError as error: | ||||
| logger.info("Got an exception in DE: {}".format(str(error))) | logger.info("Got an exception in DE: {}".format(str(error))) | ||||
| assert "Argument ignore with value (10,100) is not of type" in str(error) | assert "Argument ignore with value (10,100) is not of type" in str(error) | ||||
| @@ -262,22 +262,22 @@ def test_auto_contrast_invalid_cutoff_param_c(): | |||||
| """ | """ | ||||
| logger.info("Test AutoContrast C Op with invalid cutoff parameter") | logger.info("Test AutoContrast C Op with invalid cutoff parameter") | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), | |||||
| C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), | |||||
| C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| # invalid cutoff | # invalid cutoff | ||||
| ds = ds.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image") | |||||
| data_set = data_set.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image") | |||||
| except ValueError as error: | except ValueError as error: | ||||
| logger.info("Got an exception in DE: {}".format(str(error))) | logger.info("Got an exception in DE: {}".format(str(error))) | ||||
| assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), | |||||
| C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), | |||||
| C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| # invalid cutoff | # invalid cutoff | ||||
| ds = ds.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image") | |||||
| data_set = data_set.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image") | |||||
| except ValueError as error: | except ValueError as error: | ||||
| logger.info("Got an exception in DE: {}".format(str(error))) | logger.info("Got an exception in DE: {}".format(str(error))) | ||||
| assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | ||||
| @@ -289,22 +289,24 @@ def test_auto_contrast_invalid_ignore_param_py(): | |||||
| """ | """ | ||||
| logger.info("Test AutoContrast python Op with invalid ignore parameter") | logger.info("Test AutoContrast python Op with invalid ignore parameter") | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||||
| F.Resize((224, 224)), | |||||
| F.AutoContrast(ignore=255.5), | |||||
| F.ToTensor()])], | |||||
| input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||||
| F.Resize((224, 224)), | |||||
| F.AutoContrast( | |||||
| ignore=255.5), | |||||
| F.ToTensor()])], | |||||
| input_columns=["image"]) | |||||
| except TypeError as error: | except TypeError as error: | ||||
| logger.info("Got an exception in DE: {}".format(str(error))) | logger.info("Got an exception in DE: {}".format(str(error))) | ||||
| assert "Argument ignore with value 255.5 is not of type" in str(error) | assert "Argument ignore with value 255.5 is not of type" in str(error) | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||||
| F.Resize((224, 224)), | |||||
| F.AutoContrast(ignore=(10, 100)), | |||||
| F.ToTensor()])], | |||||
| input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||||
| F.Resize((224, 224)), | |||||
| F.AutoContrast( | |||||
| ignore=(10, 100)), | |||||
| F.ToTensor()])], | |||||
| input_columns=["image"]) | |||||
| except TypeError as error: | except TypeError as error: | ||||
| logger.info("Got an exception in DE: {}".format(str(error))) | logger.info("Got an exception in DE: {}".format(str(error))) | ||||
| assert "Argument ignore with value (10,100) is not of type" in str(error) | assert "Argument ignore with value (10,100) is not of type" in str(error) | ||||
| @@ -316,18 +318,19 @@ def test_auto_contrast_invalid_cutoff_param_py(): | |||||
| """ | """ | ||||
| logger.info("Test AutoContrast python Op with invalid cutoff parameter") | logger.info("Test AutoContrast python Op with invalid cutoff parameter") | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||||
| F.Resize((224, 224)), | |||||
| F.AutoContrast(cutoff=-10.0), | |||||
| F.ToTensor()])], | |||||
| input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||||
| F.Resize((224, 224)), | |||||
| F.AutoContrast( | |||||
| cutoff=-10.0), | |||||
| F.ToTensor()])], | |||||
| input_columns=["image"]) | |||||
| except ValueError as error: | except ValueError as error: | ||||
| logger.info("Got an exception in DE: {}".format(str(error))) | logger.info("Got an exception in DE: {}".format(str(error))) | ||||
| assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map( | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map( | |||||
| operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| F.AutoContrast(cutoff=120.0), | F.AutoContrast(cutoff=120.0), | ||||
| @@ -17,7 +17,7 @@ Testing Equalize op in DE | |||||
| """ | """ | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.py_transforms | import mindspore.dataset.transforms.py_transforms | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.vision.py_transforms as F | import mindspore.dataset.vision.py_transforms as F | ||||
| @@ -37,13 +37,13 @@ def test_equalize_py(plot=False): | |||||
| logger.info("Test Equalize") | logger.info("Test Equalize") | ||||
| # Original Images | # Original Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = ds_original.batch(512) | ds_original = ds_original.batch(512) | ||||
| @@ -56,14 +56,14 @@ def test_equalize_py(plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Color Equalized Images | # Color Equalized Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_equalize = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_equalize = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| F.Equalize(), | F.Equalize(), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_equalize = ds.map(operations=transforms_equalize, input_columns="image") | |||||
| ds_equalize = data_set.map(operations=transforms_equalize, input_columns="image") | |||||
| ds_equalize = ds_equalize.batch(512) | ds_equalize = ds_equalize.batch(512) | ||||
| @@ -92,11 +92,11 @@ def test_equalize_c(plot=False): | |||||
| logger.info("Test Equalize cpp op") | logger.info("Test Equalize cpp op") | ||||
| # Original Images | # Original Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = [C.Decode(), C.Resize(size=[224, 224])] | transforms_original = [C.Decode(), C.Resize(size=[224, 224])] | ||||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = ds_original.batch(512) | ds_original = ds_original.batch(512) | ||||
| @@ -109,12 +109,12 @@ def test_equalize_c(plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Equalize Images | # Equalize Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transform_equalize = [C.Decode(), C.Resize(size=[224, 224]), | transform_equalize = [C.Decode(), C.Resize(size=[224, 224]), | ||||
| C.Equalize()] | C.Equalize()] | ||||
| ds_equalize = ds.map(operations=transform_equalize, input_columns="image") | |||||
| ds_equalize = data_set.map(operations=transform_equalize, input_columns="image") | |||||
| ds_equalize = ds_equalize.batch(512) | ds_equalize = ds_equalize.batch(512) | ||||
| @@ -142,10 +142,10 @@ def test_equalize_py_c(plot=False): | |||||
| logger.info("Test Equalize cpp and python op") | logger.info("Test Equalize cpp and python op") | ||||
| # equalize Images in cpp | # equalize Images in cpp | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| ds_c_equalize = ds.map(operations=C.Equalize(), input_columns="image") | |||||
| ds_c_equalize = data_set.map(operations=C.Equalize(), input_columns="image") | |||||
| ds_c_equalize = ds_c_equalize.batch(512) | ds_c_equalize = ds_c_equalize.batch(512) | ||||
| @@ -158,15 +158,15 @@ def test_equalize_py_c(plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Equalize images in python | # Equalize images in python | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| transforms_p_equalize = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), | transforms_p_equalize = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), | ||||
| F.ToPIL(), | F.ToPIL(), | ||||
| F.Equalize(), | F.Equalize(), | ||||
| np.array]) | np.array]) | ||||
| ds_p_equalize = ds.map(operations=transforms_p_equalize, input_columns="image") | |||||
| ds_p_equalize = data_set.map(operations=transforms_p_equalize, input_columns="image") | |||||
| ds_p_equalize = ds_p_equalize.batch(512) | ds_p_equalize = ds_p_equalize.batch(512) | ||||
| @@ -197,11 +197,11 @@ def test_equalize_one_channel(): | |||||
| c_op = C.Equalize() | c_op = C.Equalize() | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| ds.map(operations=c_op, input_columns="image") | |||||
| data_set.map(operations=c_op, input_columns="image") | |||||
| except RuntimeError as e: | except RuntimeError as e: | ||||
| logger.info("Got an exception in DE: {}".format(str(e))) | logger.info("Got an exception in DE: {}".format(str(e))) | ||||
| @@ -213,9 +213,9 @@ def test_equalize_mnist_c(plot=False): | |||||
| Test Equalize C op with MNIST dataset (Grayscale images) | Test Equalize C op with MNIST dataset (Grayscale images) | ||||
| """ | """ | ||||
| logger.info("Test Equalize C Op With MNIST Images") | logger.info("Test Equalize C Op With MNIST Images") | ||||
| ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| ds_equalize_c = ds.map(operations=C.Equalize(), input_columns="image") | |||||
| ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| ds_equalize_c = data_set.map(operations=C.Equalize(), input_columns="image") | |||||
| ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| images = [] | images = [] | ||||
| images_trans = [] | images_trans = [] | ||||
| @@ -242,7 +242,7 @@ def test_equalize_md5_py(): | |||||
| logger.info("Test Equalize") | logger.info("Test Equalize") | ||||
| # First dataset | # First dataset | ||||
| data1 = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data1 = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Equalize(), | F.Equalize(), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| @@ -260,14 +260,14 @@ def test_equalize_md5_c(): | |||||
| logger.info("Test Equalize cpp op with md5 check") | logger.info("Test Equalize cpp op with md5 check") | ||||
| # Generate dataset | # Generate dataset | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_equalize = [C.Decode(), | transforms_equalize = [C.Decode(), | ||||
| C.Resize(size=[224, 224]), | C.Resize(size=[224, 224]), | ||||
| C.Equalize(), | C.Equalize(), | ||||
| F.ToTensor()] | F.ToTensor()] | ||||
| data = ds.map(operations=transforms_equalize, input_columns="image") | |||||
| data = data_set.map(operations=transforms_equalize, input_columns="image") | |||||
| # Compare with expected md5 from images | # Compare with expected md5 from images | ||||
| filename = "equalize_01_result_c.npz" | filename = "equalize_01_result_c.npz" | ||||
| save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | ||||
| @@ -17,7 +17,7 @@ Testing Invert op in DE | |||||
| """ | """ | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.py_transforms | import mindspore.dataset.transforms.py_transforms | ||||
| import mindspore.dataset.vision.py_transforms as F | import mindspore.dataset.vision.py_transforms as F | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| @@ -36,13 +36,13 @@ def test_invert_py(plot=False): | |||||
| logger.info("Test Invert Python op") | logger.info("Test Invert Python op") | ||||
| # Original Images | # Original Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = ds_original.batch(512) | ds_original = ds_original.batch(512) | ||||
| @@ -55,14 +55,14 @@ def test_invert_py(plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Color Inverted Images | # Color Inverted Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| F.Invert(), | F.Invert(), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_invert = ds.map(operations=transforms_invert, input_columns="image") | |||||
| ds_invert = data_set.map(operations=transforms_invert, input_columns="image") | |||||
| ds_invert = ds_invert.batch(512) | ds_invert = ds_invert.batch(512) | ||||
| @@ -91,11 +91,11 @@ def test_invert_c(plot=False): | |||||
| logger.info("Test Invert cpp op") | logger.info("Test Invert cpp op") | ||||
| # Original Images | # Original Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = [C.Decode(), C.Resize(size=[224, 224])] | transforms_original = [C.Decode(), C.Resize(size=[224, 224])] | ||||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = ds_original.batch(512) | ds_original = ds_original.batch(512) | ||||
| @@ -108,12 +108,12 @@ def test_invert_c(plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Invert Images | # Invert Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transform_invert = [C.Decode(), C.Resize(size=[224, 224]), | transform_invert = [C.Decode(), C.Resize(size=[224, 224]), | ||||
| C.Invert()] | C.Invert()] | ||||
| ds_invert = ds.map(operations=transform_invert, input_columns="image") | |||||
| ds_invert = data_set.map(operations=transform_invert, input_columns="image") | |||||
| ds_invert = ds_invert.batch(512) | ds_invert = ds_invert.batch(512) | ||||
| @@ -141,10 +141,10 @@ def test_invert_py_c(plot=False): | |||||
| logger.info("Test Invert cpp and python op") | logger.info("Test Invert cpp and python op") | ||||
| # Invert Images in cpp | # Invert Images in cpp | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| ds_c_invert = ds.map(operations=C.Invert(), input_columns="image") | |||||
| ds_c_invert = data_set.map(operations=C.Invert(), input_columns="image") | |||||
| ds_c_invert = ds_c_invert.batch(512) | ds_c_invert = ds_c_invert.batch(512) | ||||
| @@ -157,15 +157,15 @@ def test_invert_py_c(plot=False): | |||||
| axis=0) | axis=0) | ||||
| # invert images in python | # invert images in python | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||||
| transforms_p_invert = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), | transforms_p_invert = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), | ||||
| F.ToPIL(), | F.ToPIL(), | ||||
| F.Invert(), | F.Invert(), | ||||
| np.array]) | np.array]) | ||||
| ds_p_invert = ds.map(operations=transforms_p_invert, input_columns="image") | |||||
| ds_p_invert = data_set.map(operations=transforms_p_invert, input_columns="image") | |||||
| ds_p_invert = ds_p_invert.batch(512) | ds_p_invert = ds_p_invert.batch(512) | ||||
| @@ -196,11 +196,11 @@ def test_invert_one_channel(): | |||||
| c_op = C.Invert() | c_op = C.Invert() | ||||
| try: | try: | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), | |||||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||||
| ds.map(operations=c_op, input_columns="image") | |||||
| data_set.map(operations=c_op, input_columns="image") | |||||
| except RuntimeError as e: | except RuntimeError as e: | ||||
| logger.info("Got an exception in DE: {}".format(str(e))) | logger.info("Got an exception in DE: {}".format(str(e))) | ||||
| @@ -214,13 +214,13 @@ def test_invert_md5_py(): | |||||
| logger.info("Test Invert python op with md5 check") | logger.info("Test Invert python op with md5 check") | ||||
| # Generate dataset | # Generate dataset | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Invert(), | F.Invert(), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| data = ds.map(operations=transforms_invert, input_columns="image") | |||||
| data = data_set.map(operations=transforms_invert, input_columns="image") | |||||
| # Compare with expected md5 from images | # Compare with expected md5 from images | ||||
| filename = "invert_01_result_py.npz" | filename = "invert_01_result_py.npz" | ||||
| save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | ||||
| @@ -233,14 +233,14 @@ def test_invert_md5_c(): | |||||
| logger.info("Test Invert cpp op with md5 check") | logger.info("Test Invert cpp op with md5 check") | ||||
| # Generate dataset | # Generate dataset | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_invert = [C.Decode(), | transforms_invert = [C.Decode(), | ||||
| C.Resize(size=[224, 224]), | C.Resize(size=[224, 224]), | ||||
| C.Invert(), | C.Invert(), | ||||
| F.ToTensor()] | F.ToTensor()] | ||||
| data = ds.map(operations=transforms_invert, input_columns="image") | |||||
| data = data_set.map(operations=transforms_invert, input_columns="image") | |||||
| # Compare with expected md5 from images | # Compare with expected md5 from images | ||||
| filename = "invert_01_result_c.npz" | filename = "invert_01_result_c.npz" | ||||
| save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | ||||
| @@ -19,7 +19,6 @@ import numpy as np | |||||
| import pytest | import pytest | ||||
| import mindspore.dataset as ds | import mindspore.dataset as ds | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset.transforms.py_transforms | import mindspore.dataset.transforms.py_transforms | ||||
| import mindspore.dataset.vision.c_transforms as vision | import mindspore.dataset.vision.c_transforms as vision | ||||
| import mindspore.dataset.vision.py_transforms as F | import mindspore.dataset.vision.py_transforms as F | ||||
| @@ -44,7 +43,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False): | |||||
| logger.info("Test RandomColor") | logger.info("Test RandomColor") | ||||
| # Original Images | # Original Images | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| @@ -63,7 +62,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Random Color Adjusted Images | # Random Color Adjusted Images | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_random_color = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_random_color = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| @@ -146,7 +145,7 @@ def test_random_color_py_md5(): | |||||
| original_num_parallel_workers = config_get_set_num_parallel_workers(1) | original_num_parallel_workers = config_get_set_num_parallel_workers(1) | ||||
| # Generate dataset | # Generate dataset | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.RandomColor((2.0, 2.5)), | F.RandomColor((2.0, 2.5)), | ||||
| @@ -234,7 +233,7 @@ def test_random_color_c_errors(): | |||||
| assert "degrees must be a sequence with length 2." in str(error_info.value) | assert "degrees must be a sequence with length 2." in str(error_info.value) | ||||
| # RandomColor Cpp Op will fail with one channel input | # RandomColor Cpp Op will fail with one channel input | ||||
| mnist_ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| mnist_ds = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| mnist_ds = mnist_ds.map(operations=vision.RandomColor(), input_columns="image") | mnist_ds = mnist_ds.map(operations=vision.RandomColor(), input_columns="image") | ||||
| with pytest.raises(RuntimeError) as error_info: | with pytest.raises(RuntimeError) as error_info: | ||||
| @@ -17,7 +17,6 @@ Testing RandomSharpness op in DE | |||||
| """ | """ | ||||
| import numpy as np | import numpy as np | ||||
| import mindspore.dataset as ds | import mindspore.dataset as ds | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset.transforms.py_transforms | import mindspore.dataset.transforms.py_transforms | ||||
| import mindspore.dataset.vision.py_transforms as F | import mindspore.dataset.vision.py_transforms as F | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| @@ -38,7 +37,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False): | |||||
| logger.info("Test RandomSharpness python op") | logger.info("Test RandomSharpness python op") | ||||
| # Original Images | # Original Images | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| @@ -57,7 +56,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Random Sharpness Adjusted Images | # Random Sharpness Adjusted Images | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| py_op = F.RandomSharpness() | py_op = F.RandomSharpness() | ||||
| if degrees is not None: | if degrees is not None: | ||||
| @@ -108,7 +107,7 @@ def test_random_sharpness_py_md5(): | |||||
| transform = mindspore.dataset.transforms.py_transforms.Compose(transforms) | transform = mindspore.dataset.transforms.py_transforms.Compose(transforms) | ||||
| # Generate dataset | # Generate dataset | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = data.map(operations=transform, input_columns=["image"]) | data = data.map(operations=transform, input_columns=["image"]) | ||||
| # check results with md5 comparison | # check results with md5 comparison | ||||
| @@ -128,7 +127,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False): | |||||
| logger.info("Test RandomSharpness cpp op") | logger.info("Test RandomSharpness cpp op") | ||||
| # Original Images | # Original Images | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = [C.Decode(), | transforms_original = [C.Decode(), | ||||
| C.Resize((224, 224))] | C.Resize((224, 224))] | ||||
| @@ -146,7 +145,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False): | |||||
| axis=0) | axis=0) | ||||
| # Random Sharpness Adjusted Images | # Random Sharpness Adjusted Images | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| c_op = C.RandomSharpness() | c_op = C.RandomSharpness() | ||||
| if degrees is not None: | if degrees is not None: | ||||
| @@ -194,7 +193,7 @@ def test_random_sharpness_c_md5(): | |||||
| ] | ] | ||||
| # Generate dataset | # Generate dataset | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = data.map(operations=transforms, input_columns=["image"]) | data = data.map(operations=transforms, input_columns=["image"]) | ||||
| # check results with md5 comparison | # check results with md5 comparison | ||||
| @@ -213,7 +212,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False): | |||||
| logger.info("Test RandomSharpness C and python Op") | logger.info("Test RandomSharpness C and python Op") | ||||
| # RandomSharpness Images | # RandomSharpness Images | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"]) | data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"]) | ||||
| python_op = F.RandomSharpness(degrees) | python_op = F.RandomSharpness(degrees) | ||||
| @@ -236,7 +235,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False): | |||||
| image, | image, | ||||
| axis=0) | axis=0) | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"]) | data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"]) | ||||
| ds_images_random_sharpness_c = data.map(operations=c_op, input_columns="image") | ds_images_random_sharpness_c = data.map(operations=c_op, input_columns="image") | ||||
| @@ -271,10 +270,10 @@ def test_random_sharpness_one_channel_c(degrees=(1.4, 1.4), plot=False): | |||||
| if degrees is not None: | if degrees is not None: | ||||
| c_op = C.RandomSharpness(degrees) | c_op = C.RandomSharpness(degrees) | ||||
| # RandomSharpness Images | # RandomSharpness Images | ||||
| data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| ds_random_sharpness_c = data.map(operations=c_op, input_columns="image") | ds_random_sharpness_c = data.map(operations=c_op, input_columns="image") | ||||
| # Original images | # Original images | ||||
| data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| images = [] | images = [] | ||||
| images_trans = [] | images_trans = [] | ||||
| @@ -296,7 +295,7 @@ def test_random_sharpness_invalid_params(): | |||||
| """ | """ | ||||
| logger.info("Test RandomSharpness with invalid input parameters.") | logger.info("Test RandomSharpness with invalid input parameters.") | ||||
| try: | try: | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | ||||
| C.RandomSharpness(10)], input_columns=["image"]) | C.RandomSharpness(10)], input_columns=["image"]) | ||||
| except TypeError as error: | except TypeError as error: | ||||
| @@ -304,7 +303,7 @@ def test_random_sharpness_invalid_params(): | |||||
| assert "tuple" in str(error) | assert "tuple" in str(error) | ||||
| try: | try: | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | ||||
| C.RandomSharpness((-10, 10))], input_columns=["image"]) | C.RandomSharpness((-10, 10))], input_columns=["image"]) | ||||
| except ValueError as error: | except ValueError as error: | ||||
| @@ -312,7 +311,7 @@ def test_random_sharpness_invalid_params(): | |||||
| assert "interval" in str(error) | assert "interval" in str(error) | ||||
| try: | try: | ||||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | ||||
| C.RandomSharpness((10, 5))], input_columns=["image"]) | C.RandomSharpness((10, 5))], input_columns=["image"]) | ||||
| except ValueError as error: | except ValueError as error: | ||||
| @@ -17,7 +17,6 @@ Testing RandomSolarizeOp op in DE | |||||
| """ | """ | ||||
| import pytest | import pytest | ||||
| import mindspore.dataset as ds | import mindspore.dataset as ds | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset.vision.c_transforms as vision | import mindspore.dataset.vision.c_transforms as vision | ||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| from util import visualize_list, save_and_check_md5, config_get_set_seed, config_get_set_num_parallel_workers, \ | from util import visualize_list, save_and_check_md5, config_get_set_seed, config_get_set_num_parallel_workers, \ | ||||
| @@ -78,8 +77,8 @@ def test_random_solarize_mnist(plot=False, run_golden=True): | |||||
| Test RandomSolarize op with MNIST dataset (Grayscale images) | Test RandomSolarize op with MNIST dataset (Grayscale images) | ||||
| """ | """ | ||||
| mnist_1 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| mnist_2 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| mnist_1 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| mnist_2 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||||
| mnist_2 = mnist_2.map(operations=vision.RandomSolarize((0, 255)), input_columns="image") | mnist_2 = mnist_2.map(operations=vision.RandomSolarize((0, 255)), input_columns="image") | ||||
| images = [] | images = [] | ||||
| @@ -18,7 +18,7 @@ Testing UniformAugment in DE | |||||
| import numpy as np | import numpy as np | ||||
| import pytest | import pytest | ||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.transforms.py_transforms | import mindspore.dataset.transforms.py_transforms | ||||
| import mindspore.dataset.vision.c_transforms as C | import mindspore.dataset.vision.c_transforms as C | ||||
| import mindspore.dataset.vision.py_transforms as F | import mindspore.dataset.vision.py_transforms as F | ||||
| @@ -35,13 +35,13 @@ def test_uniform_augment(plot=False, num_ops=2): | |||||
| logger.info("Test UniformAugment") | logger.info("Test UniformAugment") | ||||
| # Original Images | # Original Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | ||||
| F.Resize((224, 224)), | F.Resize((224, 224)), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = ds_original.batch(512) | ds_original = ds_original.batch(512) | ||||
| @@ -54,7 +54,7 @@ def test_uniform_augment(plot=False, num_ops=2): | |||||
| axis=0) | axis=0) | ||||
| # UniformAugment Images | # UniformAugment Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transform_list = [F.RandomRotation(45), | transform_list = [F.RandomRotation(45), | ||||
| F.RandomColor(), | F.RandomColor(), | ||||
| @@ -70,7 +70,7 @@ def test_uniform_augment(plot=False, num_ops=2): | |||||
| num_ops=num_ops), | num_ops=num_ops), | ||||
| F.ToTensor()]) | F.ToTensor()]) | ||||
| ds_ua = ds.map(operations=transforms_ua, input_columns="image") | |||||
| ds_ua = data_set.map(operations=transforms_ua, input_columns="image") | |||||
| ds_ua = ds_ua.batch(512) | ds_ua = ds_ua.batch(512) | ||||
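The Python UniformAugment test above builds a pool of candidate ops, wraps the pool in `F.UniformAugment` so that `num_ops` of them are drawn at random per image, and batches the result for comparison with the un-augmented pipeline. A condensed sketch of that pipeline; the op pool and `DATA_DIR` below are illustrative stand-ins, not the exact values used in the test.

```python
import mindspore.dataset as ds
import mindspore.dataset.transforms.py_transforms as py_transforms
import mindspore.dataset.vision.py_transforms as F

DATA_DIR = "path/to/ImageFolderData"  # hypothetical placeholder

op_pool = [F.RandomRotation(45), F.RandomColor(), F.RandomSharpness()]  # illustrative candidate ops
compose = py_transforms.Compose([F.Decode(),
                                 F.Resize((224, 224)),
                                 F.UniformAugment(transforms=op_pool, num_ops=2),  # apply 2 random ops per image
                                 F.ToTensor()])

data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
ds_ua = data_set.map(operations=compose, input_columns="image").batch(512)
```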
| @@ -99,12 +99,12 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): | |||||
| logger.info("Test CPP UniformAugment") | logger.info("Test CPP UniformAugment") | ||||
| # Original Images | # Original Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_original = [C.Decode(), C.Resize(size=[224, 224]), | transforms_original = [C.Decode(), C.Resize(size=[224, 224]), | ||||
| F.ToTensor()] | F.ToTensor()] | ||||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||||
| ds_original = ds_original.batch(512) | ds_original = ds_original.batch(512) | ||||
| @@ -117,7 +117,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): | |||||
| axis=0) | axis=0) | ||||
| # UniformAugment Images | # UniformAugment Images | ||||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||||
| transforms_ua = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]), | transforms_ua = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]), | ||||
| C.RandomHorizontalFlip(), | C.RandomHorizontalFlip(), | ||||
| C.RandomVerticalFlip(), | C.RandomVerticalFlip(), | ||||
| @@ -130,7 +130,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): | |||||
| uni_aug, | uni_aug, | ||||
| F.ToTensor()] | F.ToTensor()] | ||||
| ds_ua = ds.map(operations=transforms_all, input_columns="image", num_parallel_workers=1) | |||||
| ds_ua = data_set.map(operations=transforms_all, input_columns="image", num_parallel_workers=1) | |||||
| ds_ua = ds_ua.batch(512) | ds_ua = ds_ua.batch(512) | ||||
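The C++ counterpart (`test_cpp_uniform_augment`) assembles the same comparison but constructs a single `C.UniformAugment` op and splices it into a flat list of c_transforms passed to one `map` call, pinned to `num_parallel_workers=1` (presumably to keep the randomized run reproducible). A condensed sketch, again with an illustrative op pool and a hypothetical `DATA_DIR` placeholder:

```python
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.vision.py_transforms as F

DATA_DIR = "path/to/ImageFolderData"  # hypothetical placeholder

op_pool = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]),  # illustrative candidate ops
           C.RandomHorizontalFlip(),
           C.RandomVerticalFlip()]
uni_aug = C.UniformAugment(transforms=op_pool, num_ops=2)

# Decode/resize with C ops, apply the uniform-augment op, then convert for comparison.
transforms_all = [C.Decode(), C.Resize(size=[224, 224]), uni_aug, F.ToTensor()]

data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
ds_ua = data_set.map(operations=transforms_all, input_columns="image",
                     num_parallel_workers=1).batch(512)
```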
| @@ -240,7 +240,7 @@ def test_cpp_uniform_augment_random_crop_badinput(num_ops=1): | |||||
| logger.info("Test CPP UniformAugment with random_crop bad input") | logger.info("Test CPP UniformAugment with random_crop bad input") | ||||
| batch_size = 2 | batch_size = 2 | ||||
| cifar10_dir = "../data/dataset/testCifar10Data" | cifar10_dir = "../data/dataset/testCifar10Data" | ||||
| ds1 = de.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3] | |||||
| ds1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3] | |||||
| transforms_ua = [ | transforms_ua = [ | ||||
| # Note: crop size [224, 224] > image size [32, 32] | # Note: crop size [224, 224] > image size [32, 32] | ||||