| @@ -14,7 +14,7 @@ | |||
| # ============================================================================ | |||
| """generate dataloader and data processing entry""" | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| from src.utils import DistributedSampler | |||
| @@ -32,7 +32,7 @@ def GetDataLoader(per_batch_size, | |||
| """ | |||
| centerface_gen = CenterfaceDataset(config=config, split=split) | |||
| sampler = DistributedSampler(centerface_gen, rank, group_size, shuffle=(split == 'train')) # user defined sampling strategy | |||
| de_dataset = de.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16) | |||
| de_dataset = ds.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16) | |||
| if group_size > 1: | |||
| num_parallel_workers = 24 | |||
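Every hunk in this change applies the same two-part refactor: the submodule import `import mindspore.dataset.engine as de` becomes the top-level `import mindspore.dataset as ds`, and the local variable formerly named `ds` is renamed to `data_set` so it no longer shadows the new module alias. Below is a minimal sketch of the resulting pattern for the loader above, reusing the `CenterfaceDataset` generator columns and the project's `DistributedSampler`; the simplified function name and signature are illustrative, not the repository's exact `GetDataLoader`.

```python
import mindspore.dataset as ds                 # module alias is now `ds`
from src.utils import DistributedSampler      # project-local sampler, as imported above

def build_centerface_loader(centerface_gen, rank, group_size, split='train'):
    """Illustrative sketch: the module alias `ds` and the dataset object `data_set` no longer collide."""
    sampler = DistributedSampler(centerface_gen, rank, group_size, shuffle=(split == 'train'))
    data_set = ds.GeneratorDataset(centerface_gen, ["image", "anns"],
                                   sampler=sampler, num_parallel_workers=16)
    return data_set
```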
| @@ -17,7 +17,7 @@ Data operations, will be used in train.py and eval.py | |||
| """ | |||
| import os | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noise_gaussian, noise_salt_pepper, \ | |||
| shift_color, enhance_brightness, enhance_sharpness, enhance_contrast, enhance_color, gaussian_blur, \ | |||
| @@ -26,6 +26,7 @@ from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noise_gaussian, noise_salt_pepper, \ | |||
| import cv2 | |||
| import numpy as np | |||
| cv2.setNumThreads(0) | |||
| image_height = None | |||
| @@ -179,23 +180,24 @@ def create_dataset_train(mindrecord_file_pos, config): | |||
| rank_id = int(os.getenv("RANK_ID", '0')) | |||
| decode = C.Decode() | |||
| ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4, | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=True) | |||
| ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8) | |||
| data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4, | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=True) | |||
| data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8) | |||
| augmentor = Augmentor(config.augment_severity, config.augment_prob) | |||
| operation = augmentor.process | |||
| ds = ds.map(operations=operation, input_columns=["image"], | |||
| num_parallel_workers=1, python_multiprocessing=True) | |||
| data_set = data_set.map(operations=operation, input_columns=["image"], | |||
| num_parallel_workers=1, python_multiprocessing=True) | |||
| ##randomly augment half of samples to be negative samples | |||
| ds = ds.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], input_columns=["image", "label"], | |||
| num_parallel_workers=8, python_multiprocessing=True) | |||
| ##for training double the dataset to accoun for positive and negative | |||
| ds = ds.repeat(2) | |||
| data_set = data_set.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], | |||
| input_columns=["image", "label"], | |||
| num_parallel_workers=8, python_multiprocessing=True) | |||
| ##for training, double the dataset to account for positive and negative samples | |||
| data_set = data_set.repeat(2) | |||
| # apply batch operations | |||
| ds = ds.batch(config.batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(config.batch_size, drop_remainder=True) | |||
| return data_set | |||
| def resize_image(img, label): | |||
| @@ -230,17 +232,18 @@ def create_dataset_eval(mindrecord_file_pos, config): | |||
| rank_id = int(os.getenv("RANK_ID", '0')) | |||
| decode = C.Decode() | |||
| ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1, | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=False) | |||
| ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8) | |||
| data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1, | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=False) | |||
| data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8) | |||
| global image_height | |||
| global image_width | |||
| image_height = config.im_size_h | |||
| image_width = config.im_size_w | |||
| ds = ds.map(operations=resize_image, input_columns=["image", "label"], num_parallel_workers=config.work_nums, | |||
| python_multiprocessing=False) | |||
| data_set = data_set.map(operations=resize_image, input_columns=["image", "label"], | |||
| num_parallel_workers=config.work_nums, | |||
| python_multiprocessing=False) | |||
| # apply batch operations | |||
| ds = ds.batch(1, drop_remainder=True) | |||
| data_set = data_set.batch(1, drop_remainder=True) | |||
| return ds | |||
| return data_set | |||
| @@ -16,7 +16,7 @@ | |||
| import os | |||
| import numpy as np | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| import mindspore.dataset.vision.c_transforms as vc | |||
| from PIL import Image, ImageFile | |||
| @@ -105,7 +105,7 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i | |||
| dataset = IIIT5KDataset(dataset_path, "annotation.txt", config) | |||
| else: | |||
| raise ValueError(f"unsupported dataset name: {name}") | |||
| ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) | |||
| data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) | |||
| image_trans = [ | |||
| vc.Resize((config.image_height, config.image_width)), | |||
| vc.Normalize([127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]), | |||
| @@ -114,8 +114,8 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i | |||
| label_trans = [ | |||
| C.TypeCast(mstype.int32) | |||
| ] | |||
| ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) | |||
| ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) | |||
| data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) | |||
| data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
| @@ -16,7 +16,7 @@ | |||
| Data operations, will be used in train.py and eval.py | |||
| """ | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| from src.config import config_gpu as cfg | |||
| @@ -37,33 +37,33 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): | |||
| dataset | |||
| """ | |||
| if group_size == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, | |||
| num_shards=group_size, shard_id=rank) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, | |||
| num_shards=group_size, shard_id=rank) | |||
| # define map operations | |||
| if do_train: | |||
| trans = [ | |||
| C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)), | |||
| C.RandomHorizontalFlip(prob=0.5), | |||
| C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4) | |||
| ] | |||
| ] | |||
| else: | |||
| trans = [ | |||
| C.Decode(), | |||
| C.Resize(299), | |||
| C.CenterCrop(299) | |||
| ] | |||
| ] | |||
| trans += [ | |||
| C.Rescale(1.0 / 255.0, 0.0), | |||
| C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), | |||
| C.HWC2CHW() | |||
| ] | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) | |||
| # apply batch operations | |||
| ds = ds.batch(cfg.batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(cfg.batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| return ds | |||
| data_set = data_set.repeat(repeat_num) | |||
| return data_set | |||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| @@ -44,10 +44,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| device_num = get_group_size() | |||
| if device_num == 1: | |||
| ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| # define map operations | |||
| trans = [] | |||
| @@ -66,15 +66,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): | |||
| @@ -99,10 +99,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| device_num = get_group_size() | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -127,16 +127,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def _get_rank_info(): | |||
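The rename does not change what these helpers return: `create_dataset1` and `create_dataset2` still hand back an ordinary MindSpore dataset pipeline, so callers iterate it exactly as before. A hedged usage sketch follows, assuming the helpers live in `src/dataset.py` (typical for these model directories) and using placeholder paths and sizes.

```python
from src.dataset import create_dataset1  # assumed module path; adjust to the actual file

# Placeholder path and batch size; evaluation-style pipeline (no random augmentation).
data_set = create_dataset1("/path/to/cifar-10-batches-bin", do_train=False, batch_size=32)
print("batches per epoch:", data_set.get_dataset_size())

for batch in data_set.create_dict_iterator():
    # Each item is a dict keyed by the mapped column names.
    print(batch["image"].shape, batch["label"].shape)
    break  # inspect a single batch only
```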
| @@ -21,7 +21,7 @@ import numpy as np | |||
| from mindspore import Tensor | |||
| from mindspore.train.model import Model | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| @@ -43,22 +43,22 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1): | |||
| rank_size = int(os.getenv("RANK_SIZE", '1')) | |||
| rank_id = int(os.getenv("RANK_ID", '0')) | |||
| if rank_size == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| elif config.platform == "GPU": | |||
| if do_train: | |||
| if config.run_distribute: | |||
| from mindspore.communication.management import get_rank, get_group_size | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| elif config.platform == "CPU": | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| resize_height = config.image_height | |||
| resize_width = config.image_width | |||
| @@ -83,19 +83,19 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1): | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply shuffle operations | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(config.batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(config.batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def extract_features(net, dataset_path, config): | |||
| @@ -121,5 +121,5 @@ def extract_features(net, dataset_path, config): | |||
| features = model.predict(Tensor(image)) | |||
| np.save(features_path, features.asnumpy()) | |||
| np.save(label_path, label) | |||
| print(f"Complete the batch {i+1}/{step_size}") | |||
| print(f"Complete the batch {i + 1}/{step_size}") | |||
| return step_size | |||
| @@ -18,7 +18,7 @@ create train or eval dataset. | |||
| import os | |||
| from functools import partial | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.transforms.py_transforms as P2 | |||
| @@ -43,24 +43,24 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, | |||
| rank_id = int(os.getenv("RANK_ID")) | |||
| columns_list = ['image', 'label'] | |||
| if config.data_load_mode == "mindrecord": | |||
| load_func = partial(de.MindDataset, dataset_path, columns_list) | |||
| load_func = partial(ds.MindDataset, dataset_path, columns_list) | |||
| else: | |||
| load_func = partial(de.ImageFolderDataset, dataset_path) | |||
| load_func = partial(ds.ImageFolderDataset, dataset_path) | |||
| if do_train: | |||
| if rank_size == 1: | |||
| ds = load_func(num_parallel_workers=8, shuffle=True) | |||
| data_set = load_func(num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = load_func(num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| data_set = load_func(num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| else: | |||
| ds = load_func(num_parallel_workers=8, shuffle=False) | |||
| data_set = load_func(num_parallel_workers=8, shuffle=False) | |||
| elif device_target == "GPU": | |||
| if do_train: | |||
| from mindspore.communication.management import get_rank, get_group_size | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| raise ValueError("Unsupported device_target.") | |||
| @@ -69,7 +69,7 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, | |||
| if do_train: | |||
| buffer_size = 20480 | |||
| # apply shuffle operations | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # define map operations | |||
| decode_op = C.Decode() | |||
| @@ -89,16 +89,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=16) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=16) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=1, batch_size=32): | |||
| @@ -119,12 +119,12 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num= | |||
| rank_id = int(os.getenv("RANK_ID")) | |||
| if do_train: | |||
| if rank_size == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) | |||
| else: | |||
| raise ValueError("Unsupported device target.") | |||
| @@ -133,7 +133,7 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num= | |||
| if do_train: | |||
| buffer_size = 20480 | |||
| # apply shuffle operations | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # define map operations | |||
| decode_op = P.Decode() | |||
| @@ -152,12 +152,13 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num= | |||
| compose = P2.Compose(trans) | |||
| ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True) | |||
| data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8, | |||
| python_multiprocessing=True) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| @@ -16,7 +16,7 @@ | |||
| create train or eval dataset. | |||
| """ | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| @@ -38,12 +38,12 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, | |||
| if do_train: | |||
| if run_distribute: | |||
| from mindspore.communication.management import get_rank, get_group_size | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| raise ValueError("Unsupported device_target.") | |||
| @@ -70,16 +70,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply shuffle operations | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| @@ -16,7 +16,7 @@ | |||
| Data operations, will be used in train.py and eval.py | |||
| """ | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| @@ -37,10 +37,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1): | |||
| rank = config.rank | |||
| group_size = config.group_size | |||
| if group_size == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True, | |||
| num_shards=group_size, shard_id=rank) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True, | |||
| num_shards=group_size, shard_id=rank) | |||
| # define map operations | |||
| if do_train: | |||
| trans = [ | |||
| @@ -60,10 +60,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1): | |||
| C.HWC2CHW() | |||
| ] | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums) | |||
| # apply batch operations | |||
| ds = ds.batch(config.batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(config.batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| return ds | |||
| data_set = data_set.repeat(repeat_num) | |||
| return data_set | |||
| @@ -25,21 +25,24 @@ import pyclipper | |||
| from PIL import Image | |||
| from src.config import config | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.py_transforms as py_transforms | |||
| __all__ = ['train_dataset_creator', 'test_dataset_creator'] | |||
| def get_img(img_path): | |||
| img = cv2.imread(img_path) | |||
| img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | |||
| return img | |||
| def get_imgs_names(root_dir): | |||
| img_paths = [i for i in os.listdir(root_dir) | |||
| if os.path.splitext(i)[-1].lower() in ['.jpg', '.jpeg', '.png']] | |||
| return img_paths | |||
| def get_bboxes(img, gt_path): | |||
| h, w = img.shape[0:2] | |||
| with open(gt_path, 'r', encoding='utf-8-sig') as f: | |||
| @@ -58,6 +61,7 @@ def get_bboxes(img, gt_path): | |||
| tags.append(tag) | |||
| return np.array(bboxes), tags | |||
| def random_scale(img, min_size): | |||
| h, w = img.shape[0:2] | |||
| if max(h, w) > 1280: | |||
| @@ -74,12 +78,14 @@ def random_scale(img, min_size): | |||
| img = cv2.resize(img, dsize=None, fx=scale2, fy=scale2) | |||
| return img | |||
| def random_horizontal_flip(imgs): | |||
| if random.random() < 0.5: | |||
| for i, _ in enumerate(imgs): | |||
| imgs[i] = np.flip(imgs[i], axis=1).copy() | |||
| return imgs | |||
| def random_rotate(imgs): | |||
| max_angle = 10 | |||
| angle = random.random() * 2 * max_angle - max_angle | |||
| @@ -91,6 +97,7 @@ def random_rotate(imgs): | |||
| imgs[i] = img_rotation | |||
| return imgs | |||
| def random_crop(imgs, img_size): | |||
| h, w = imgs[0].shape[0:2] | |||
| th, tw = img_size | |||
| @@ -118,21 +125,25 @@ def random_crop(imgs, img_size): | |||
| imgs[idx] = imgs[idx][i:i + th, j:j + tw] | |||
| return imgs | |||
| def scale(img, long_size=2240): | |||
| h, w = img.shape[0:2] | |||
| scale_long = long_size * 1.0 / max(h, w) | |||
| img = cv2.resize(img, dsize=None, fx=scale_long, fy=scale_long) | |||
| return img | |||
| def dist(a, b): | |||
| return np.sqrt(np.sum((a - b) ** 2)) | |||
| def perimeter(bbox): | |||
| peri = 0.0 | |||
| for i in range(bbox.shape[0]): | |||
| peri += dist(bbox[i], bbox[(i + 1) % bbox.shape[0]]) | |||
| return peri | |||
| def shrink(bboxes, rate, max_shr=20): | |||
| rate = rate * rate | |||
| shrinked_bboxes = [] | |||
| @@ -158,6 +169,7 @@ def shrink(bboxes, rate, max_shr=20): | |||
| return np.array(shrinked_bboxes) | |||
| class TrainDataset: | |||
| def __init__(self): | |||
| self.is_transform = True | |||
| @@ -260,6 +272,7 @@ class TrainDataset: | |||
| def __len__(self): | |||
| return len(self.all_img_paths) | |||
| def IC15_TEST_Generator(): | |||
| ic15_test_data_dir = config.TEST_ROOT_DIR + 'ch4_test_images/' | |||
| img_size = config.INFER_LONG_SIZE | |||
| @@ -298,6 +311,7 @@ def IC15_TEST_Generator(): | |||
| yield img, img_resized, img_name | |||
| class DistributedSampler(): | |||
| def __init__(self, dataset, rank, group_size, shuffle=True, seed=0): | |||
| self.dataset = dataset | |||
| @@ -324,18 +338,20 @@ class DistributedSampler(): | |||
| def __len__(self): | |||
| return self.num_samplers | |||
| def train_dataset_creator(rank, group_size, shuffle=True): | |||
| cv2.setNumThreads(0) | |||
| dataset = TrainDataset() | |||
| sampler = DistributedSampler(dataset, rank, group_size, shuffle) | |||
| ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8, | |||
| sampler=sampler) | |||
| ds = ds.repeat(1) | |||
| ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8, | |||
| sampler=sampler) | |||
| data_set = data_set.repeat(1) | |||
| data_set = data_set.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER) | |||
| return data_set | |||
| def test_dataset_creator(): | |||
| ds = de.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name']) | |||
| ds = ds.shuffle(config.TEST_BUFFER_SIZE) | |||
| ds = ds.batch(1, drop_remainder=config.TEST_DROP_REMAINDER) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name']) | |||
| data_set = data_set.shuffle(config.TEST_BUFFER_SIZE) | |||
| data_set = data_set.batch(1, drop_remainder=config.TEST_DROP_REMAINDER) | |||
| return data_set | |||
| @@ -29,7 +29,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| from mindspore.common import set_seed | |||
| import mindspore.nn as nn | |||
| import mindspore.common.initializer as weight_init | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| from src.resnet_gpu_benchmark import resnet50 as resnet | |||
| from src.CrossEntropySmooth import CrossEntropySmooth | |||
| @@ -45,19 +45,22 @@ parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dat | |||
| parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\ | |||
| Or the ckpt model file when eval is True') | |||
| parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode') | |||
| parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\ | |||
| help='Compute data type fp32 or fp16: default fp16') | |||
| parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \ | |||
| help='Compute data type fp32 or fp16: default fp16') | |||
| args_opt = parser.parse_args() | |||
| set_seed(1) | |||
| class MyTimeMonitor(Callback): | |||
| def __init__(self, batch_size, sink_size): | |||
| super(MyTimeMonitor, self).__init__() | |||
| self.batch_size = batch_size | |||
| self.size = sink_size | |||
| def step_begin(self, run_context): | |||
| self.step_time = time.time() | |||
| def step_end(self, run_context): | |||
| cb_params = run_context.original_args() | |||
| loss = cb_params.net_outputs | |||
| @@ -75,17 +78,18 @@ class MyTimeMonitor(Callback): | |||
| raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format( | |||
| cb_params.cur_epoch_num, cur_step_in_epoch)) | |||
| step_mseconds = (time.time() - self.step_time) * 1000 | |||
| fps = self.batch_size / step_mseconds *1000 * self.size | |||
| fps = self.batch_size / step_mseconds * 1000 * self.size | |||
| print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss), | |||
| "Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True) | |||
| def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16", | |||
| device_num=1): | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True, | |||
| num_shards=device_num, shard_id=get_rank()) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True, | |||
| num_shards=device_num, shard_id=get_rank()) | |||
| image_size = 224 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| std = [0.229 * 255, 0.224 * 255, 0.225 * 255] | |||
| @@ -113,14 +117,15 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||
| ] | |||
| if dtype == "fp32": | |||
| trans.append(C.HWC2CHW()) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| if repeat_num > 1: | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return data_set | |||
| return ds | |||
| def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch): | |||
| lr_each_step = [] | |||
| @@ -136,6 +141,7 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per | |||
| lr_each_step = np.array(lr_each_step).astype(np.float32) | |||
| return lr_each_step | |||
| def train(): | |||
| # set args | |||
| dev = "GPU" | |||
| @@ -221,6 +227,7 @@ def train(): | |||
| else: | |||
| model.train(epoch_size, dataset, callbacks=cb) | |||
| def eval_(): | |||
| # set args | |||
| dev = "GPU" | |||
| @@ -251,6 +258,7 @@ def eval_(): | |||
| res = model.eval(dataset) | |||
| print("result:", res, "ckpt=", ckpt_dir) | |||
| if __name__ == '__main__': | |||
| if not args_opt.eval: | |||
| train() | |||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| @@ -47,10 +47,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| else: | |||
| device_num = 1 | |||
| if device_num == 1: | |||
| ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| # define map operations | |||
| trans = [] | |||
| @@ -69,15 +69,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): | |||
| @@ -106,10 +106,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| device_num = 1 | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -134,16 +134,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): | |||
| @@ -171,10 +171,10 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| device_num = 1 | |||
| rank_id = 1 | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [0.475 * 255, 0.451 * 255, 0.392 * 255] | |||
| std = [0.275 * 255, 0.267 * 255, 0.278 * 255] | |||
| @@ -198,15 +198,15 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): | |||
| @@ -234,10 +234,10 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| else: | |||
| device_num = 1 | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [123.68, 116.78, 103.94] | |||
| std = [1.0, 1.0, 1.0] | |||
| @@ -260,16 +260,16 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| ] | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=12) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def _get_rank_info(): | |||
| @@ -18,7 +18,7 @@ create train or eval dataset. | |||
| import os | |||
| from functools import partial | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.py_transforms as P2 | |||
| @@ -53,14 +53,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||
| columns_list = ['image', 'label'] | |||
| if config.data_load_mode == "mindrecord": | |||
| load_func = partial(de.MindDataset, dataset_path, columns_list) | |||
| load_func = partial(ds.MindDataset, dataset_path, columns_list) | |||
| else: | |||
| load_func = partial(de.ImageFolderDataset, dataset_path) | |||
| load_func = partial(ds.ImageFolderDataset, dataset_path) | |||
| if device_num == 1: | |||
| ds = load_func(num_parallel_workers=8, shuffle=True) | |||
| data_set = load_func(num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = load_func(num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = load_func(num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -85,16 +85,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): | |||
| @@ -121,12 +121,12 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe | |||
| if do_train: | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) | |||
| image_size = 224 | |||
| @@ -147,12 +147,13 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe | |||
| trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op] | |||
| compose = P2.Compose(trans) | |||
| ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True) | |||
| data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8, | |||
| python_multiprocessing=True) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| @@ -47,10 +47,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||
| num_parallels = 4 | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -75,16 +75,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def _get_rank_info(): | |||
| @@ -15,7 +15,7 @@ | |||
| """Data operations, will be used in train.py and eval.py""" | |||
| from src.config import config | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| @@ -36,10 +36,10 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0): | |||
| """ | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank) | |||
| # define map operations | |||
| if do_train: | |||
| trans = [ | |||
| @@ -59,8 +59,8 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0): | |||
| ] | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||
| ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(config.batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(config.batch_size, drop_remainder=True) | |||
| return data_set | |||
| @@ -19,7 +19,7 @@ import numpy as np | |||
| from src.config import config_gpu as cfg | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| @@ -46,10 +46,10 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): | |||
| dataset | |||
| """ | |||
| if group_size == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, | |||
| num_shards=group_size, shard_id=rank) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, | |||
| num_shards=group_size, shard_id=rank) | |||
| # define map operations | |||
| if do_train: | |||
| trans = [ | |||
| @@ -71,9 +71,9 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): | |||
| ] | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) | |||
| # apply batch operations | |||
| ds = ds.batch(cfg.batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(cfg.batch_size, drop_remainder=True) | |||
| return ds | |||
| return data_set | |||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| @@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path, | |||
| device_num = get_group_size() | |||
| if device_num == 1: | |||
| ds = de.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| data_set = ds.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| else: | |||
| ds = de.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| data_set = ds.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| # define map operations | |||
| if do_train: | |||
| @@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path, | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| ds = ds.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset_imagenet(dataset_path, | |||
| @@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path, | |||
| device_num = get_group_size() | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| image_size = 227 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path, | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| ds = ds.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def _get_rank_info(): | |||
| @@ -17,7 +17,7 @@ import os | |||
| import math as m | |||
| import numpy as np | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as c | |||
| import mindspore.dataset.vision.c_transforms as vc | |||
| from PIL import Image | |||
| @@ -86,7 +86,7 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_ | |||
| """ | |||
| dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target) | |||
| ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) | |||
| data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) | |||
| image_trans = [ | |||
| vc.Rescale(1.0 / 255.0, 0.0), | |||
| vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]), | |||
| @@ -96,12 +96,12 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_ | |||
| label_trans = [ | |||
| c.TypeCast(mstype.int32) | |||
| ] | |||
| ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) | |||
| data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) | |||
| if device_target == 'Ascend': | |||
| ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8) | |||
| data_set = data_set.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8) | |||
| else: | |||
| ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8) | |||
| ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) | |||
| data_set = data_set.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8) | |||
| data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
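The captcha loader above keeps two layout transposes and picks one per device target. The helper functions themselves are defined elsewhere in the Warp-CTC sources, so the NumPy versions below are only an assumed sketch of what they compute:

# Assumed implementations of the helpers referenced in the map() calls above.
import numpy as np

def transpose_hwc2chw(image):
    # (H, W, C) -> (C, H, W): the layout used on the GPU/CPU branch.
    return np.transpose(image, (2, 0, 1))

def transpose_hwc2whc(image):
    # (H, W, C) -> (W, H, C): the layout the Ascend branch maps to above.
    return np.transpose(image, (1, 0, 2))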
| @@ -16,10 +16,11 @@ | |||
| Data operations, will be used in train.py and eval.py | |||
| """ | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): | |||
| """ | |||
| create a train or eval dataset | |||
| @@ -35,10 +36,10 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): | |||
| dataset | |||
| """ | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank) | |||
| # define map operations | |||
| if do_train: | |||
| trans = [ | |||
| @@ -59,8 +60,8 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): | |||
| ] | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||
| ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
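The hunk skips the body of the `trans` lists for this ImageFolder loader. A typical shape for such lists, offered only as an assumption about what the elided code looks like (sizes and normalization constants are illustrative):

# Assumed transform lists; heavier augmentation for training, deterministic
# resize/crop for evaluation.
import mindspore.dataset.vision.c_transforms as C

image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

train_trans = [
    C.RandomCropDecodeResize(image_size),
    C.RandomHorizontalFlip(prob=0.5),
    C.Normalize(mean=mean, std=std),
    C.HWC2CHW(),
]
eval_trans = [
    C.Decode(),
    C.Resize(256),
    C.CenterCrop(image_size),
    C.Normalize(mean=mean, std=std),
    C.HWC2CHW(),
]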
| @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore import log as logger | |||
| from .config import cfg | |||
| @@ -31,65 +31,67 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, | |||
| for file_name in files: | |||
| if "tfrecord" in file_name: | |||
| data_files.append(os.path.join(data_dir, file_name)) | |||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, | |||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, | |||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print('origin dataset size: ', ori_dataset_size) | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| # apply batch operations | |||
| ds = ds.batch(cfg.batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||
| return ds | |||
| data_set = data_set.batch(cfg.batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||
| return data_set | |||
| def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | |||
| data_file_path=None, schema_file_path=None, do_shuffle=True): | |||
| """create finetune or evaluation dataset""" | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) | |||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], | |||
| shuffle=do_shuffle) | |||
| if assessment_method == "Spearman_correlation": | |||
| type_cast_op_float = C.TypeCast(mstype.float32) | |||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| else: | |||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.repeat(repeat_count) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
| def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | |||
| data_file_path=None, schema_file_path=None, do_shuffle=True): | |||
| """create finetune or evaluation dataset""" | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) | |||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], | |||
| shuffle=do_shuffle) | |||
| if assessment_method == "Spearman_correlation": | |||
| type_cast_op_float = C.TypeCast(mstype.float32) | |||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| else: | |||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.repeat(repeat_count) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
| def generator_squad(data_features): | |||
| @@ -102,20 +104,20 @@ def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, sche | |||
| """create finetune or evaluation dataset""" | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| if is_training: | |||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", | |||
| "end_positions", "unique_ids", "is_impossible"], | |||
| shuffle=do_shuffle) | |||
| ds = ds.map(operations=type_cast_op, input_columns="start_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="end_positions") | |||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", | |||
| "end_positions", "unique_ids", "is_impossible"], | |||
| shuffle=do_shuffle) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="end_positions") | |||
| else: | |||
| ds = de.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, | |||
| column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="unique_ids") | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, | |||
| column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="unique_ids") | |||
| data_set = data_set.repeat(repeat_count) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
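A condensed sketch of the pretraining loader pattern used above, with the repeated TypeCast maps folded into a loop; the file list, schema path, and batch size are placeholders:

# Sketch of the TFRecord pretraining reader; not a drop-in replacement.
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C

def create_bert_dataset_sketch(data_files, schema_dir=None, device_num=1, rank=0, batch_size=32):
    columns = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
               "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"]
    data_set = ds.TFRecordDataset(data_files, schema_dir, columns_list=columns,
                                  shuffle=ds.Shuffle.FILES,
                                  num_shards=device_num, shard_id=rank,
                                  shard_equal_rows=True)
    type_cast_op = C.TypeCast(mstype.int32)
    # Cast every integer column; "masked_lm_weights" stays float and is skipped.
    for column in ["masked_lm_ids", "masked_lm_positions", "next_sentence_labels",
                   "segment_ids", "input_mask", "input_ids"]:
        data_set = data_set.map(operations=type_cast_op, input_columns=column)
    return data_set.batch(batch_size, drop_remainder=True)

File-level shuffle (`ds.Shuffle.FILES`) randomizes the order of input files rather than individual rows, which is cheaper for a large pretraining corpus.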
| @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore import log as logger | |||
| from .bert_net_config import bert_net_cfg | |||
| @@ -32,96 +32,96 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, | |||
| if "tfrecord" in file_name: | |||
| data_files.append(os.path.join(data_dir, file_name)) | |||
| data_files = sorted(data_files) | |||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, | |||
| num_shards=device_num, shard_id=rank, shard_equal_rows=False) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, | |||
| num_shards=device_num, shard_id=rank, shard_equal_rows=False) | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print('origin dataset size: ', ori_dataset_size) | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| # apply batch operations | |||
| ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||
| return ds | |||
| data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||
| return data_set | |||
| def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | |||
| data_file_path=None, schema_file_path=None): | |||
| """create finetune or evaluation dataset""" | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||
| if assessment_method == "Spearman_correlation": | |||
| type_cast_op_float = C.TypeCast(mstype.float32) | |||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| else: | |||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.repeat(repeat_count) | |||
| # apply shuffle operation | |||
| buffer_size = 960 | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
| def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", | |||
| data_file_path=None, schema_file_path=None): | |||
| """create finetune or evaluation dataset""" | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) | |||
| if assessment_method == "Spearman_correlation": | |||
| type_cast_op_float = C.TypeCast(mstype.float32) | |||
| ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") | |||
| else: | |||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.repeat(repeat_count) | |||
| # apply shuffle operation | |||
| buffer_size = 960 | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
| def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True): | |||
| """create finetune or evaluation dataset""" | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| if is_training: | |||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", | |||
| "start_positions", "end_positions", | |||
| "unique_ids", "is_impossible"]) | |||
| ds = ds.map(operations=type_cast_op, input_columns="start_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="end_positions") | |||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", | |||
| "start_positions", "end_positions", | |||
| "unique_ids", "is_impossible"]) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="end_positions") | |||
| else: | |||
| ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"]) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.repeat(repeat_count) | |||
| # apply shuffle operation | |||
| buffer_size = 960 | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
| @@ -22,7 +22,7 @@ import mindspore.ops.operations as P | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.train.model import Model | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as deC | |||
| from mindspore import context | |||
| from src.fasttext_model import FastText | |||
| @@ -73,15 +73,15 @@ class FastTextInferCell(nn.Cell): | |||
| def load_infer_dataset(batch_size, datafile): | |||
| """data loader for infer""" | |||
| ds = de.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx']) | |||
| data_set = ds.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx']) | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens") | |||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length") | |||
| ds = ds.map(operations=type_cast_op, input_columns="label_idx") | |||
| ds = ds.batch(batch_size=batch_size, drop_remainder=True) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_idx") | |||
| data_set = data_set.batch(batch_size=batch_size, drop_remainder=True) | |||
| return ds | |||
| return data_set | |||
| def run_fasttext_infer(): | |||
| """run infer with FastText""" | |||
| @@ -25,8 +25,10 @@ import spacy | |||
| from sklearn.feature_extraction import FeatureHasher | |||
| from mindspore.mindrecord import FileWriter | |||
| class FastTextDataPreProcess(): | |||
| """FastText data preprocess""" | |||
| def __init__(self, train_path, | |||
| test_file, | |||
| max_length, | |||
| @@ -194,7 +196,6 @@ class FastTextDataPreProcess(): | |||
| if self.text_less in sent_describe and self.text_greater in sent_describe: | |||
| sent_describe = self.str_html.sub('', sent_describe) | |||
| doc = spacy_nlp(sent_describe) | |||
| bows_token = [token.text for token in doc] | |||
| @@ -222,7 +223,7 @@ class FastTextDataPreProcess(): | |||
| def _get_bucket_length(self, x, bts): | |||
| x_len = len(x) | |||
| for index in range(1, len(bts)): | |||
| if bts[index-1] < x_len <= bts[index]: | |||
| if bts[index - 1] < x_len <= bts[index]: | |||
| return bts[index] | |||
| return bts[0] | |||
| @@ -310,7 +311,6 @@ if __name__ == '__main__': | |||
| print("Writing test data to MindRecord file.....") | |||
| for k in args.test_bucket: | |||
| write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1) | |||
| print("All done.....") | |||
| @@ -14,9 +14,10 @@ | |||
| # ============================================================================ | |||
| """FastText data loader""" | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as deC | |||
| def load_dataset(dataset_path, | |||
| batch_size, | |||
| epoch_count=1, | |||
| @@ -25,38 +26,40 @@ def load_dataset(dataset_path, | |||
| bucket=None, | |||
| shuffle=True): | |||
| """dataset loader""" | |||
| def batch_per_bucket(bucket_length, input_file): | |||
| input_file = input_file +'/train_dataset_bs_' + str(bucket_length) + '.mindrecord' | |||
| input_file = input_file + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord' | |||
| if not input_file: | |||
| raise FileNotFoundError("input file parameter must not be empty.") | |||
| ds = de.MindDataset(input_file, | |||
| columns_list=['src_tokens', 'src_tokens_length', 'label_idx'], | |||
| shuffle=shuffle, | |||
| num_shards=rank_size, | |||
| shard_id=rank_id, | |||
| num_parallel_workers=8) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| data_set = ds.MindDataset(input_file, | |||
| columns_list=['src_tokens', 'src_tokens_length', 'label_idx'], | |||
| shuffle=shuffle, | |||
| num_shards=rank_size, | |||
| shard_id=rank_id, | |||
| num_parallel_workers=8) | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print(f"Dataset size: {ori_dataset_size}") | |||
| repeat_count = epoch_count | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens") | |||
| ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length") | |||
| ds = ds.map(operations=type_cast_op, input_columns="label_idx") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_idx") | |||
| data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'], | |||
| output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag']) | |||
| data_set = data_set.batch(batch_size, drop_remainder=False) | |||
| data_set = data_set.repeat(repeat_count) | |||
| return data_set | |||
| ds = ds.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'], | |||
| output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag']) | |||
| ds = ds.batch(batch_size, drop_remainder=False) | |||
| ds = ds.repeat(repeat_count) | |||
| return ds | |||
| for i, _ in enumerate(bucket): | |||
| bucket_len = bucket[i] | |||
| ds_per = batch_per_bucket(bucket_len, dataset_path) | |||
| if i == 0: | |||
| ds = ds_per | |||
| data_set = ds_per | |||
| else: | |||
| ds = ds + ds_per | |||
| ds = ds.shuffle(ds.get_dataset_size()) | |||
| ds.channel_name = 'fasttext' | |||
| data_set = data_set + ds_per | |||
| data_set = data_set.shuffle(data_set.get_dataset_size()) | |||
| data_set.channel_name = 'fasttext' | |||
| return ds | |||
| return data_set | |||
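The bucketed FastText loader builds one pipeline per bucket length and joins them with `+`, MindSpore's dataset concatenation, before a final shuffle. A toy sketch of that flow, with NumPy arrays standing in for the per-bucket MindRecord files (bucket sizes and batch size are illustrative):

# Toy reproduction of the per-bucket concatenation above.
import numpy as np
import mindspore.dataset as ds

def toy_bucket(bucket_length, samples=4):
    # Stand-in for one per-bucket MindRecord file: `samples` rows padded to bucket_length.
    tokens = np.arange(samples * bucket_length, dtype=np.int32).reshape(samples, bucket_length)
    return ds.NumpySlicesDataset({"src_tokens": tokens}, shuffle=False)

def load_buckets_sketch(bucket_lengths, batch_size=2):
    data_set = None
    for bucket_length in bucket_lengths:
        per_bucket = toy_bucket(bucket_length).batch(batch_size, drop_remainder=False)
        # `+` concatenates datasets that share column names and types.
        data_set = per_bucket if data_set is None else data_set + per_bucket
    return data_set.shuffle(buffer_size=data_set.get_dataset_size())

batched = load_buckets_sketch([16, 32, 64])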
| @@ -15,7 +15,7 @@ | |||
| """Dataset loader to feed into model.""" | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as deC | |||
| @@ -55,7 +55,7 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||
| print(f" | Loading {datafile}.") | |||
| if not is_translate: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| input_files, columns_list=[ | |||
| "src", "src_padding", | |||
| "prev_opt", | |||
| @@ -64,18 +64,18 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||
| num_parallel_workers=8 | |||
| ) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print(f" | Dataset size: {ori_dataset_size}.") | |||
| if shuffle: | |||
| ds = ds.shuffle(buffer_size=ori_dataset_size // 20) | |||
| data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20) | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||
| ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||
| ds = ds.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8) | |||
| ds = ds.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8) | |||
| ds = ds.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8) | |||
| ds = ds.rename( | |||
| data_set = data_set.rename( | |||
| input_columns=["src", | |||
| "src_padding", | |||
| "prev_opt", | |||
| @@ -87,9 +87,9 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||
| "target_eos_ids", | |||
| "target_eos_mask"] | |||
| ) | |||
| ds = ds.batch(batch_size, drop_remainder=drop_remainder) | |||
| data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) | |||
| else: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| input_files, columns_list=[ | |||
| "src", "src_padding" | |||
| ], | |||
| @@ -97,23 +97,23 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, | |||
| num_parallel_workers=8 | |||
| ) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print(f" | Dataset size: {ori_dataset_size}.") | |||
| if shuffle: | |||
| ds = ds.shuffle(buffer_size=ori_dataset_size // 20) | |||
| data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20) | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||
| ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) | |||
| ds = ds.rename( | |||
| data_set = data_set.rename( | |||
| input_columns=["src", | |||
| "src_padding"], | |||
| output_columns=["source_eos_ids", | |||
| "source_eos_mask"] | |||
| ) | |||
| ds = ds.batch(batch_size, drop_remainder=drop_remainder) | |||
| data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) | |||
| return ds | |||
| return data_set | |||
| def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool, | |||
| @@ -14,7 +14,7 @@ | |||
| # ============================================================================ | |||
| """Dataset loader to feed into model.""" | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as deC | |||
| @@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||
| for datafile in input_files: | |||
| print(f" | Loading {datafile}.") | |||
| ds = de.TFRecordDataset( | |||
| data_set = ds.TFRecordDataset( | |||
| input_files, | |||
| columns_list=[ | |||
| "src", "src_padding", | |||
| @@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||
| shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True, num_parallel_workers=8) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print(f" | Dataset size: {ori_dataset_size}.") | |||
| repeat_count = epoch_count | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="src") | |||
| ds = ds.map(operations=type_cast_op, input_columns="src_padding") | |||
| ds = ds.map(operations=type_cast_op, input_columns="prev_opt") | |||
| ds = ds.map(operations=type_cast_op, input_columns="prev_padding") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target") | |||
| ds = ds.map(operations=type_cast_op, input_columns="tgt_padding") | |||
| ds = ds.rename( | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="src") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="src_padding") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="prev_opt") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="prev_padding") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="tgt_padding") | |||
| data_set = data_set.rename( | |||
| input_columns=["src", | |||
| "src_padding", | |||
| "prev_opt", | |||
| @@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||
| "target_eos_mask"] | |||
| ) | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.repeat(repeat_count) | |||
| ds.channel_name = 'transformer' | |||
| return ds | |||
| data_set.channel_name = 'transformer' | |||
| return data_set | |||
| def load_dataset(data_files: list, batch_size: int, epoch_count: int, | |||
| @@ -14,7 +14,7 @@ | |||
| # ============================================================================ | |||
| """Dataset loader to feed into model.""" | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as deC | |||
| @@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||
| for datafile in input_files: | |||
| print(f" | Loading {datafile}.") | |||
| ds = de.TFRecordDataset( | |||
| data_set = ds.TFRecordDataset( | |||
| input_files, | |||
| columns_list=[ | |||
| "src", "src_padding", | |||
| @@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||
| shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True, num_parallel_workers=8) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print(f" | Dataset size: {ori_dataset_size}.") | |||
| repeat_count = epoch_count | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="src", operations=type_cast_op) | |||
| ds = ds.map(input_columns="src_padding", operations=type_cast_op) | |||
| ds = ds.map(input_columns="prev_opt", operations=type_cast_op) | |||
| ds = ds.map(input_columns="prev_padding", operations=type_cast_op) | |||
| ds = ds.map(input_columns="target", operations=type_cast_op) | |||
| ds = ds.map(input_columns="tgt_padding", operations=type_cast_op) | |||
| ds = ds.rename( | |||
| data_set = data_set.map(input_columns="src", operations=type_cast_op) | |||
| data_set = data_set.map(input_columns="src_padding", operations=type_cast_op) | |||
| data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op) | |||
| data_set = data_set.map(input_columns="prev_padding", operations=type_cast_op) | |||
| data_set = data_set.map(input_columns="target", operations=type_cast_op) | |||
| data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op) | |||
| data_set = data_set.rename( | |||
| input_columns=["src", | |||
| "src_padding", | |||
| "prev_opt", | |||
| @@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1, | |||
| "target_eos_mask"] | |||
| ) | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| ds = ds.repeat(repeat_count) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.repeat(repeat_count) | |||
| ds.channel_name = 'transformer' | |||
| return ds | |||
| data_set.channel_name = 'transformer' | |||
| return data_set | |||
| def load_dataset(data_files: list, batch_size: int, epoch_count: int, | |||
| @@ -18,14 +18,16 @@ | |||
| import os | |||
| from enum import Enum | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| class DataType(Enum): | |||
| """Enumerate supported dataset format""" | |||
| TFRECORD = 1 | |||
| MINDRECORD = 2 | |||
| def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, | |||
| do_shuffle="true", data_dir=None, schema_dir=None, | |||
| data_type=DataType.TFRECORD): | |||
| @@ -47,22 +49,22 @@ def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, | |||
| shuffle = False | |||
| if data_type == DataType.MINDRECORD: | |||
| ds = de.MindDataset(data_files, columns_list=columns_list, | |||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank) | |||
| data_set = ds.MindDataset(data_files, columns_list=columns_list, | |||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank) | |||
| else: | |||
| ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list, | |||
| shuffle=shuffle, num_shards=device_num, shard_id=rank, | |||
| shard_equal_rows=shard_equal_rows) | |||
| data_set = ds.TFRecordDataset(data_files, schema_dir, columns_list=columns_list, | |||
| shuffle=shuffle, num_shards=device_num, shard_id=rank, | |||
| shard_equal_rows=shard_equal_rows) | |||
| if device_num == 1 and shuffle is True: | |||
| ds = ds.shuffle(10000) | |||
| data_set = data_set.shuffle(10000) | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| if task == "td": | |||
| ds = ds.map(operations=type_cast_op, input_columns="label_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| return data_set | |||
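A hypothetical call site for the selector above (paths are placeholders): the same entry point serves both storage formats, and only the `data_type` enum decides whether `ds.MindDataset` or `ds.TFRecordDataset` does the reading.

# Hypothetical usage; directory and schema paths are placeholders.
data_set = create_tinybert_dataset(task="td", batch_size=32, device_num=1, rank=0,
                                   do_shuffle="true",
                                   data_dir="/path/to/tfrecord_dir",
                                   schema_dir="/path/to/schema.json",
                                   data_type=DataType.TFRECORD)
print(data_set.get_dataset_size())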
| @@ -23,38 +23,41 @@ from mindspore.common.parameter import Parameter | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.train.model import Model | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as deC | |||
| from mindspore import context | |||
| from src.transformer_model import TransformerModel | |||
| from src.eval_config import cfg, transformer_net_cfg | |||
| def load_test_data(batch_size=1, data_file=None): | |||
| """ | |||
| Load test dataset | |||
| """ | |||
| ds = de.MindDataset(data_file, | |||
| columns_list=["source_eos_ids", "source_eos_mask", | |||
| "target_sos_ids", "target_sos_mask", | |||
| "target_eos_ids", "target_eos_mask"], | |||
| shuffle=False) | |||
| data_set = ds.MindDataset(data_file, | |||
| columns_list=["source_eos_ids", "source_eos_mask", | |||
| "target_sos_ids", "target_sos_mask", | |||
| "target_eos_ids", "target_eos_mask"], | |||
| shuffle=False) | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask") | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| ds.channel_name = 'transformer' | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| data_set.channel_name = 'transformer' | |||
| return data_set | |||
| class TransformerInferCell(nn.Cell): | |||
| """ | |||
| Encapsulation class of transformer network infer. | |||
| """ | |||
| def __init__(self, network): | |||
| super(TransformerInferCell, self).__init__(auto_prefix=False) | |||
| self.network = network | |||
| @@ -65,6 +68,7 @@ class TransformerInferCell(nn.Cell): | |||
| predicted_ids = self.network(source_ids, source_mask) | |||
| return predicted_ids | |||
| def load_weights(model_path): | |||
| """ | |||
| Load checkpoint as parameter dict, support both npz file and mindspore checkpoint file. | |||
| @@ -93,6 +97,7 @@ def load_weights(model_path): | |||
| parameter_dict[name] = Parameter(Tensor(weights[name]), name=name) | |||
| return parameter_dict | |||
| def run_transformer_eval(): | |||
| """ | |||
| Transformer evaluation. | |||
| @@ -136,5 +141,6 @@ def run_transformer_eval(): | |||
| f.write(" ".join(token_ids) + "\n") | |||
| f.close() | |||
| if __name__ == "__main__": | |||
| run_transformer_eval() | |||
| @@ -21,7 +21,7 @@ from enum import Enum | |||
| import numpy as np | |||
| import pandas as pd | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| from .config import DataConfig | |||
| @@ -142,8 +142,8 @@ class H5Dataset(): | |||
| X_id = X[:, 0:self.max_length] | |||
| X_va = X[:, self.max_length:] | |||
| yield np.array(X_id.astype(dtype=np.int32)), \ | |||
| np.array(X_va.astype(dtype=np.float32)), \ | |||
| np.array(y.astype(dtype=np.float32)) | |||
| np.array(X_va.astype(dtype=np.float32)), \ | |||
| np.array(y.astype(dtype=np.float32)) | |||
| def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||
| @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||
| for _ in range(0, numbers_of_batch, 1): | |||
| yield train_eval_gen.__next__() | |||
| ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
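`_get_h5_dataset` wraps a plain Python generator, so the pattern can be reproduced end to end with toy data; the sketch below assumes nothing about the real H5 layout beyond the three column names:

# Runnable sketch: GeneratorDataset pulling (ids, weights, labels) triples
# from a Python generator, as the H5 path above does.
import numpy as np
import mindspore.dataset as ds

def _iter_toy_data(num_batches=5, batch_size=4, field_size=39):
    for _ in range(num_batches):
        yield (np.random.randint(0, 100, size=(batch_size, field_size)).astype(np.int32),
               np.random.rand(batch_size, field_size).astype(np.float32),
               np.random.randint(0, 2, size=(batch_size, 1)).astype(np.float32))

data_set = ds.GeneratorDataset(_iter_toy_data, ["ids", "weights", "labels"])
data_set = data_set.repeat(1)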
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||
| shuffle = train_mode | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| else: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
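The `.batch(int(batch_size / line_per_sample))` followed by the flatten/reshape map works because every stored MindRecord row packs `line_per_sample` samples. A small NumPy check of that arithmetic (the numbers are illustrative, not from the dataset):

# Shape check for the flatten/reshape map above.
import numpy as np

batch_size, line_per_sample, field_size = 8, 4, 39
rows = batch_size // line_per_sample                     # rows pulled per .batch() call

# One batch as it leaves .batch(): (rows, line_per_sample * field_size)
feat_ids = np.arange(rows * line_per_sample * field_size, dtype=np.int32).reshape(
    rows, line_per_sample * field_size)

reshaped = np.array(feat_ids).flatten().reshape(batch_size, field_size)
assert reshaped.shape == (batch_size, 39)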
| def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| for filename in filenames: | |||
| if file_prefixt_name in filename and 'tfrecord' in filename: | |||
| dataset_files.append(os.path.join(dir_path, filename)) | |||
| schema = de.Schema() | |||
| schema = ds.Schema() | |||
| schema.add_column('feat_ids', de_type=mstype.int32) | |||
| schema.add_column('feat_vals', de_type=mstype.float32) | |||
| schema.add_column('label', de_type=mstype.float32) | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True) | |||
| else: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: ( | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||
| np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -14,13 +14,12 @@ | |||
| # ============================================================================ | |||
| """train_dataset.""" | |||
| import os | |||
| import math | |||
| from enum import Enum | |||
| import numpy as np | |||
| import pandas as pd | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| @@ -84,9 +83,9 @@ class H5Dataset(): | |||
| yield os.path.join(self._hdf_data_dir, | |||
| self._file_prefix + '_input_part_' + str( | |||
| p) + '.h5'), \ | |||
| os.path.join(self._hdf_data_dir, | |||
| self._file_prefix + '_output_part_' + str( | |||
| p) + '.h5'), i + 1 == len(parts) | |||
| os.path.join(self._hdf_data_dir, | |||
| self._file_prefix + '_output_part_' + str( | |||
| p) + '.h5'), i + 1 == len(parts) | |||
| def _generator(self, X, y, batch_size, shuffle=True): | |||
| """ | |||
| @@ -106,8 +105,7 @@ class H5Dataset(): | |||
| np.random.shuffle(sample_index) | |||
| assert X.shape[0] > 0 | |||
| while True: | |||
| batch_index = sample_index[ | |||
| batch_size * counter: batch_size * (counter + 1)] | |||
| batch_index = sample_index[batch_size * counter: batch_size * (counter + 1)] | |||
| X_batch = X[batch_index] | |||
| y_batch = y[batch_index] | |||
| counter += 1 | |||
| @@ -140,9 +138,8 @@ class H5Dataset(): | |||
| X, y, finished = data_gen.__next__() | |||
| X_id = X[:, 0:self.input_length] | |||
| X_va = X[:, self.input_length:] | |||
| yield np.array(X_id.astype(dtype=np.int32)), np.array( | |||
| X_va.astype(dtype=np.float32)), np.array( | |||
| y.astype(dtype=np.float32)) | |||
| yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array( | |||
| y.astype(dtype=np.float32)) | |||
| def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | |||
| @@ -164,9 +161,9 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | |||
| for _ in range(0, numbers_of_batch, 1): | |||
| yield train_eval_gen.__next__() | |||
| ds = de.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"]) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"]) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def _padding_func(batch_size, manual_shape, target_column, field_size=39): | |||
| @@ -174,11 +171,11 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39): | |||
| get padding_func | |||
| """ | |||
| if manual_shape: | |||
| generate_concat_offset = [item[0]+item[1] for item in manual_shape] | |||
| generate_concat_offset = [item[0] + item[1] for item in manual_shape] | |||
| part_size = int(target_column / len(generate_concat_offset)) | |||
| filled_value = [] | |||
| for i in range(field_size, target_column): | |||
| filled_value.append(generate_concat_offset[i//part_size]-1) | |||
| filled_value.append(generate_concat_offset[i // part_size] - 1) | |||
| print("Filed Value:", filled_value) | |||
| def padding_func(x, y, z): | |||
| @@ -190,7 +187,7 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39): | |||
| dtype=np.int32) * filled_value | |||
| x_id = np.concatenate([x, x_id.astype(dtype=np.int32)], axis=1) | |||
| mask = np.concatenate( | |||
| [y, np.zeros((batch_size, target_column-39), dtype=np.float32)], axis=1) | |||
| [y, np.zeros((batch_size, target_column - 39), dtype=np.float32)], axis=1) | |||
| return (x_id, mask, z) | |||
| else: | |||
| def padding_func(x, y, z): | |||
| @@ -214,24 +211,25 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | |||
| for filename in filenames: | |||
| if file_prefix_name in filename and "tfrecord" in filename: | |||
| dataset_files.append(os.path.join(dirpath, filename)) | |||
| schema = de.Schema() | |||
| schema = ds.Schema() | |||
| schema.add_column('feat_ids', de_type=mstype.int32) | |||
| schema.add_column('feat_vals', de_type=mstype.float32) | |||
| schema.add_column('label', de_type=mstype.float32) | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, | |||
| num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||
| else: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, | |||
| shuffle=shuffle, schema=schema, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), | |||
| drop_remainder=True) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, | |||
| shuffle=shuffle, schema=schema, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), | |||
| drop_remainder=True) | |||
| ds = ds.map(operations=_padding_func(batch_size, manual_shape, target_column), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = data_set.map(operations=_padding_func(batch_size, manual_shape, target_column), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -257,21 +255,21 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||
| shuffle = train_mode | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| else: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(_padding_func(batch_size, manual_shape, target_column), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(_padding_func(batch_size, manual_shape, target_column), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
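The `int(batch_size / line_per_sample)` term in both readers can look surprising: each MindRecord/TFRecord row packs `line_per_sample` logical samples, so the batch operator only needs `batch_size / line_per_sample` rows to deliver `batch_size` samples per step. A quick check with made-up numbers:

```python
# Illustrative arithmetic only; the numbers are not taken from the diff.
batch_size = 16000       # logical samples consumed per training step
line_per_sample = 1000   # logical samples packed into one stored record
rows_per_batch = int(batch_size / line_per_sample)
assert rows_per_batch == 16   # 16 records * 1000 samples each = 16000 samples per step
```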
| def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multiply=False, per_vocab_size=None): | |||
| @@ -284,7 +282,7 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl | |||
| 5, 21762, 14, 15, 15030, 61, 12220] | |||
| new_vocabs = inidival_vocabs + [1] * \ | |||
| (target_column_number - len(inidival_vocabs)) | |||
| (target_column_number - len(inidival_vocabs)) | |||
| part_size = int(target_column_number / worker_size) | |||
| # According to the workers, we merge some fields into the same part | |||
| @@ -304,21 +302,21 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl | |||
| # Expands the vocabulary of each field by the multiplier | |||
| if multiply is True: | |||
| cur_sum = sum(new_vocab_size) | |||
| k = total_vocab_size/cur_sum | |||
| k = total_vocab_size / cur_sum | |||
| new_vocab_size = [ | |||
| math.ceil(int(item*k)/worker_size)*worker_size for item in new_vocab_size] | |||
| new_vocab_size = [(item // 8 + 1)*8 for item in new_vocab_size] | |||
| math.ceil(int(item * k) / worker_size) * worker_size for item in new_vocab_size] | |||
| new_vocab_size = [(item // 8 + 1) * 8 for item in new_vocab_size] | |||
| else: | |||
| if total_vocab_size > sum(new_vocab_size): | |||
| new_vocab_size[-1] = total_vocab_size - \ | |||
| sum(new_vocab_size[:-1]) | |||
| sum(new_vocab_size[:-1]) | |||
| new_vocab_size = [item for item in new_vocab_size] | |||
| else: | |||
| raise ValueError( | |||
| "Please providede the correct vocab size, now is {}".format(total_vocab_size)) | |||
| for i in range(worker_size-1): | |||
| for i in range(worker_size - 1): | |||
| off = index_offsets[i] + features[i] | |||
| index_offsets.append(off) | |||
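For readers tracing `_get_vocab_size`: when `multiply` is enabled, the per-part vocabulary sizes are scaled by `k = total_vocab_size / cur_sum`, rounded up to a multiple of `worker_size`, and then padded to a multiple of 8. A hedged, self-contained sketch of just that branch (the input list and constants are illustrative):

```python
# Sketch of the scaling branch shown above; new_vocab_size is a list of per-part sizes.
import math

def scale_vocab(new_vocab_size, total_vocab_size, worker_size):
    cur_sum = sum(new_vocab_size)
    k = total_vocab_size / cur_sum
    # Scale each part, then round up to a multiple of worker_size ...
    scaled = [math.ceil(int(item * k) / worker_size) * worker_size for item in new_vocab_size]
    # ... and finally pad to a multiple of 8.
    return [(item // 8 + 1) * 8 for item in scaled]

print(scale_vocab([100, 50, 50], total_vocab_size=400, worker_size=2))  # -> [208, 104, 104]
```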
| @@ -17,7 +17,7 @@ | |||
| import os | |||
| import sys | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| from mindspore import Model, context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor | |||
| from mindspore.context import ParallelMode | |||
| @@ -88,7 +88,7 @@ def train_and_eval(config): | |||
| print("epochs is {}".format(epochs)) | |||
| if config.full_batch: | |||
| context.set_auto_parallel_context(full_batch=True) | |||
| de.config.set_seed(1) | |||
| ds.config.set_seed(1) | |||
| if config.field_slice: | |||
| compute_manual_shape(config, get_group_size()) | |||
| ds_train = create_dataset(data_path, train_mode=True, epochs=1, | |||
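The `ds.config.set_seed(1)` call above matters under `full_batch=True`: every rank must shuffle identically so the global batch is consistent across devices, which is why the seed is fixed before any dataset is built. A minimal sketch of that ordering, with `create_dataset` standing in for the helper used in the diff:

```python
# Hedged sketch: fix the global dataset seed before building shuffling datasets,
# so all ranks see the same sample order under full-batch auto-parallel.
import mindspore.dataset as ds
from mindspore import context

context.set_auto_parallel_context(full_batch=True)
ds.config.set_seed(1)   # must precede dataset construction to take effect on shuffling
# ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=...)  # as in the diff
```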
| @@ -17,7 +17,7 @@ | |||
| import os | |||
| import sys | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| from mindspore import Model, context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor | |||
| from mindspore.context import ParallelMode | |||
| @@ -92,7 +92,7 @@ def train_and_eval(config): | |||
| print("epochs is {}".format(epochs)) | |||
| if config.full_batch: | |||
| context.set_auto_parallel_context(full_batch=True) | |||
| de.config.set_seed(1) | |||
| ds.config.set_seed(1) | |||
| ds_train = create_dataset(data_path, train_mode=True, epochs=1, | |||
| batch_size=batch_size*get_group_size(), data_type=dataset_type) | |||
| ds_eval = create_dataset(data_path, train_mode=False, epochs=1, | |||
| @@ -18,7 +18,7 @@ import math | |||
| import pickle | |||
| import numpy as np | |||
| import pandas as pd | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| @@ -97,8 +97,7 @@ class H5Dataset(): | |||
| np.random.shuffle(sample_index) | |||
| assert X.shape[0] > 0 | |||
| while True: | |||
| batch_index = sample_index[batch_size * counter:batch_size * | |||
| (counter + 1)] | |||
| batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] | |||
| X_batch = X[batch_index] | |||
| y_batch = y[batch_index] | |||
| counter += 1 | |||
| @@ -135,9 +134,8 @@ class H5Dataset(): | |||
| X, y, finished = data_gen.__next__() | |||
| X_id = X[:, 0:self.input_length] | |||
| X_va = X[:, self.input_length:] | |||
| yield np.array(X_id.astype(dtype=np.int32)), np.array( | |||
| X_va.astype(dtype=np.float32)), np.array( | |||
| y.astype(dtype=np.float32)) | |||
| yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array( | |||
| y.astype(dtype=np.float32)) | |||
| def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | |||
| @@ -159,10 +157,10 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): | |||
| for _ in range(0, numbers_of_batch, 1): | |||
| yield train_eval_gen.__next__() | |||
| ds = de.GeneratorDataset(_iter_h5_data(), | |||
| ["ids", "weights", "labels"]) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(_iter_h5_data(), | |||
| ["ids", "weights", "labels"]) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
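The H5 path simply wraps a plain Python generator; the rename does not change that, but the pattern is easy to miss inside the diff. A hedged sketch with a toy generator standing in for `_iter_h5_data`:

```python
# Minimal sketch: feed a Python generator into GeneratorDataset (toy data, not the H5 reader).
import numpy as np
import mindspore.dataset as ds

def _iter_toy_data():
    for _ in range(4):
        yield (np.zeros(39, np.int32), np.ones(39, np.float32), np.zeros(1, np.float32))

data_set = ds.GeneratorDataset(_iter_toy_data, ["ids", "weights", "labels"])
data_set = data_set.repeat(2)   # epochs are handled via repeat(), as in the diff
```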
| def _get_tf_dataset(data_dir, | |||
| @@ -184,7 +182,7 @@ def _get_tf_dataset(data_dir, | |||
| for filename in filenames: | |||
| if file_prefix_name in filename and "tfrecord" in filename: | |||
| dataset_files.append(os.path.join(dirpath, filename)) | |||
| schema = de.Schema() | |||
| schema = ds.Schema() | |||
| float_key_list = ["label", "continue_val"] | |||
| @@ -199,19 +197,19 @@ def _get_tf_dataset(data_dir, | |||
| schema.add_column(key, de_type=ms_dtype) | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, | |||
| shuffle=shuffle, | |||
| schema=schema, | |||
| num_parallel_workers=8, | |||
| num_shards=rank_size, | |||
| shard_id=rank_id, | |||
| shard_equal_rows=True) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, | |||
| shuffle=shuffle, | |||
| schema=schema, | |||
| num_parallel_workers=8, | |||
| num_shards=rank_size, | |||
| shard_id=rank_id, | |||
| shard_equal_rows=True) | |||
| else: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, | |||
| shuffle=shuffle, | |||
| schema=schema, | |||
| num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, | |||
| shuffle=shuffle, | |||
| schema=schema, | |||
| num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| operations_list = [] | |||
| for key in columns_list: | |||
| @@ -249,7 +247,7 @@ def _get_tf_dataset(data_dir, | |||
| u = np.array(u).flatten().reshape(batch_size, -1) | |||
| return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u | |||
| ds = ds.map( | |||
| data_set = data_set.map( | |||
| operations=mixup, | |||
| input_columns=[ | |||
| 'label', 'continue_val', 'indicator_id', 'emb_128_id', | |||
| @@ -275,8 +273,8 @@ def _get_tf_dataset(data_dir, | |||
| ], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def compute_emb_dim(config): | |||
| @@ -24,16 +24,17 @@ import cv2 | |||
| import numpy as np | |||
| import pycocotools.coco as coco | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| from mindspore.mindrecord import FileWriter | |||
| from src.image import color_aug, get_affine_transform, affine_transform | |||
| from src.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian, draw_dense_reg | |||
| from src.visual import visual_image | |||
| _current_dir = os.path.dirname(os.path.realpath(__file__)) | |||
| class COCOHP(de.Dataset): | |||
| class COCOHP(ds.Dataset): | |||
| """ | |||
| Encapsulation class of the COCO person keypoints dataset. | |||
| Initialize and preprocess images for training and testing. | |||
| @@ -47,6 +48,7 @@ class COCOHP(de.Dataset): | |||
| Returns: | |||
| Preprocessed training or testing dataset for the CenterNet network. | |||
| """ | |||
| def __init__(self, data_opt, run_mode="train", net_opt=None, enable_visual_image=False, save_path=None): | |||
| super(COCOHP, self).__init__() | |||
| self._data_rng = np.random.RandomState(123) | |||
| @@ -64,7 +66,6 @@ class COCOHP(de.Dataset): | |||
| if not os.path.exists(self.save_path): | |||
| os.makedirs(self.save_path) | |||
| def init(self, data_dir, keep_res=False, flip_test=False): | |||
| """initailize additional info""" | |||
| logger.info('Initializing coco 2017 {} data.'.format(self.run_mode)) | |||
| @@ -124,7 +125,7 @@ class COCOHP(de.Dataset): | |||
| for img_id in self.images: | |||
| image_info = self.coco.loadImgs([img_id]) | |||
| annos = self.coco.loadAnns(self.anns[img_id]) | |||
| #get image | |||
| # get image | |||
| img_name = image_info[0]['file_name'] | |||
| img_name = os.path.join(self.image_path, img_name) | |||
| with open(img_name, 'rb') as f: | |||
| @@ -147,19 +148,16 @@ class COCOHP(de.Dataset): | |||
| writer.commit() | |||
| logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir)) | |||
| def _coco_box_to_bbox(self, box): | |||
| bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32) | |||
| return bbox | |||
| def _get_border(self, border, size): | |||
| i = 1 | |||
| while size - border // i <= border // i: | |||
| i *= 2 | |||
| return border // i | |||
| def __getitem__(self, index): | |||
| img_id = self.images[index] | |||
| file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name'] | |||
| @@ -169,7 +167,6 @@ class COCOHP(de.Dataset): | |||
| ret = (img, image_id) | |||
| return ret | |||
| def pre_process_for_test(self, image, img_id, scale, meta=None): | |||
| """image pre-process for evaluation""" | |||
| b, h, w, ch = image.shape | |||
| @@ -249,7 +246,6 @@ class COCOHP(de.Dataset): | |||
| return images, meta | |||
| def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id): | |||
| """image pre-process and augmentation""" | |||
| num_objs = min(num_objects, self.data_opt.max_objs) | |||
| @@ -269,12 +265,12 @@ class COCOHP(de.Dataset): | |||
| else: | |||
| sf = self.data_opt.scale | |||
| cf = self.data_opt.shift | |||
| c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf) | |||
| c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf) | |||
| s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf) | |||
| c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) | |||
| c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) | |||
| s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) | |||
| if np.random.random() < self.data_opt.aug_rot: | |||
| rf = self.data_opt.rotate | |||
| rot = np.clip(np.random.randn()*rf, -rf*2, rf*2) | |||
| rot = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) | |||
| if np.random.random() < self.data_opt.flip_prop: | |||
| flipped = True | |||
| @@ -323,7 +319,7 @@ class COCOHP(de.Dataset): | |||
| cls_id = int(category_id[k]) - 1 | |||
| pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3) | |||
| if flipped: | |||
| bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero | |||
| bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero | |||
| pts[:, 0] = width - pts[:, 0] - 1 | |||
| for e in self.data_opt.flip_idx: | |||
| pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy() | |||
| @@ -360,7 +356,7 @@ class COCOHP(de.Dataset): | |||
| if pts[j, 2] > 0: | |||
| pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot) | |||
| if pts[j, 0] >= 0 and pts[j, 0] < output_res and \ | |||
| pts[j, 1] >= 0 and pts[j, 1] < output_res: | |||
| pts[j, 1] >= 0 and pts[j, 1] < output_res: | |||
| kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int | |||
| kps_mask[k, j * 2: j * 2 + 2] = 1 | |||
| pt_int = pts[j, :2].astype(np.int32) | |||
| @@ -399,7 +395,6 @@ class COCOHP(de.Dataset): | |||
| visual_image(out_img, ground_truth, self.save_path, ratio=self.data_opt.input_res[0] // output_res) | |||
| return ret | |||
| def create_train_dataset(self, mindrecord_dir, prefix="coco_hp.train.mind", batch_size=1, | |||
| device_num=1, rank=0, num_parallel_workers=1, do_shuffle=True): | |||
| """create train dataset based on mindrecord file""" | |||
| @@ -415,41 +410,43 @@ class COCOHP(de.Dataset): | |||
| raise ValueError('data_dir {} has no data files'.format(mindrecord_dir)) | |||
| columns = ["image", "num_objects", "keypoints", "bbox", "category_id"] | |||
| ds = de.MindDataset(data_files, | |||
| columns_list=columns, | |||
| num_parallel_workers=num_parallel_workers, shuffle=do_shuffle, | |||
| num_shards=device_num, shard_id=rank) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| data_set = ds.MindDataset(data_files, | |||
| columns_list=columns, | |||
| num_parallel_workers=num_parallel_workers, shuffle=do_shuffle, | |||
| num_shards=device_num, shard_id=rank) | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| logger.info('origin dataset size: {}'.format(ori_dataset_size)) | |||
| ds = ds.map(operations=self.preprocess_fn, | |||
| input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"], | |||
| output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||
| column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||
| num_parallel_workers=num_parallel_workers, | |||
| python_multiprocessing=True) | |||
| ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||
| return ds | |||
| data_set = data_set.map(operations=self.preprocess_fn, | |||
| input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"], | |||
| output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||
| column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", | |||
| "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], | |||
| num_parallel_workers=num_parallel_workers, | |||
| python_multiprocessing=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||
| return data_set | |||
| def create_eval_dataset(self, batch_size=1, num_parallel_workers=1): | |||
| """create testing dataset based on coco format""" | |||
| def generator(): | |||
| for i in range(self.num_samples): | |||
| yield self.__getitem__(i) | |||
| column = ["image", "image_id"] | |||
| ds = de.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers) | |||
| ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8) | |||
| return data_set | |||
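The training map above is the one place in this file where a single operation fans one record out into twelve output columns, which is why `output_columns` and `column_order` are spelled out. A reduced sketch of that shape (two inputs, three outputs), using a toy transform rather than `preprocess_fn`:

```python
# Hedged sketch of a column-expanding map; the names and toy transform are illustrative.
import numpy as np
import mindspore.dataset as ds

def split_fn(image, label):
    # Return one value per declared output column.
    return image, label, np.array(label.sum(), np.float32)

source = [(np.zeros((2, 2), np.float32), np.array([1, 2], np.int32)) for _ in range(4)]
data_set = ds.GeneratorDataset(source, ["image", "label"], shuffle=False)
data_set = data_set.map(operations=split_fn,
                        input_columns=["image", "label"],
                        output_columns=["image", "label", "label_sum"],
                        column_order=["image", "label", "label_sum"],
                        num_parallel_workers=1,
                        python_multiprocessing=False)
```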
| if __name__ == '__main__': | |||
| # Convert coco2017 dataset to mindrecord to improve performance on host | |||
| from src.config import dataset_config | |||
| parser = argparse.ArgumentParser(description='CenterNet MindRecord dataset') | |||
| parser.add_argument("--coco_data_dir", type=str, default="", help="Coco dataset directory.") | |||
| parser.add_argument("--mindrecord_dir", type=str, default="", help="MindRecord dataset dir.") | |||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.vision.py_transforms as P | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| @@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| rank_size = int(os.getenv("RANK_SIZE")) | |||
| rank_id = int(os.getenv("RANK_ID")) | |||
| if rank_size == 1: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| elif platform == "GPU": | |||
| if do_train: | |||
| from mindspore.communication.management import get_rank, get_group_size | |||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| else: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| raise ValueError("Unsupport platform.") | |||
| @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| color_op = C.RandomColorAdjust( | |||
| brightness=0.4, contrast=0.4, saturation=0.4) | |||
| rescale_op = C.Rescale(1/255.0, 0) | |||
| rescale_op = C.Rescale(1 / 255.0, 0) | |||
| normalize_op = C.Normalize( | |||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
| change_swap_op = C.HWC2CHW() | |||
| @@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| trans = composeop() | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="image", operations=trans, | |||
| num_parallel_workers=8) | |||
| ds = ds.map(input_columns="label_list", | |||
| operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="image", operations=trans, | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="label_list", | |||
| operations=type_cast_op, num_parallel_workers=8) | |||
| # apply shuffle operations | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
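Apart from the rename, these hunks only touch spacing in the transform list; the pipeline itself reads MindRecord rows, applies colour jitter, rescale, normalize and HWC2CHW on the image column, then a `TypeCast` on the label column. A hedged sketch of that chain (the file path is a placeholder, and whether `Decode` is needed depends on how the images are stored, which the hunk does not show):

```python
# Sketch of the transform chain used above; dataset_path and batch size are placeholders.
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.common.dtype as mstype

data_set = ds.MindDataset("/path/to/data.mindrecord", num_parallel_workers=8, shuffle=True)
trans = [
    C.Decode(),  # only if the MindRecord stores encoded images (assumption)
    C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4),
    C.Rescale(1 / 255.0, 0),
    C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    C.HWC2CHW(),
]
data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
data_set = data_set.map(input_columns="label_list", operations=C2.TypeCast(mstype.int32),
                        num_parallel_workers=8)
data_set = data_set.batch(32, drop_remainder=True)
```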
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.vision.py_transforms as P | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| @@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| rank_size = int(os.getenv("RANK_SIZE")) | |||
| rank_id = int(os.getenv("RANK_ID")) | |||
| if rank_size == 1: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| elif platform == "GPU": | |||
| if do_train: | |||
| from mindspore.communication.management import get_rank, get_group_size | |||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| else: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| raise ValueError("Unsupport platform.") | |||
| @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| color_op = C.RandomColorAdjust( | |||
| brightness=0.4, contrast=0.4, saturation=0.4) | |||
| rescale_op = C.Rescale(1/255.0, 0) | |||
| rescale_op = C.Rescale(1 / 255.0, 0) | |||
| normalize_op = C.Normalize( | |||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
| change_swap_op = C.HWC2CHW() | |||
| @@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| trans = composeop() | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="image", operations=trans, | |||
| num_parallel_workers=8) | |||
| ds = ds.map(input_columns="label_list", | |||
| operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="image", operations=trans, | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="label_list", | |||
| operations=type_cast_op, num_parallel_workers=8) | |||
| # apply shuffle operations | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.vision.py_transforms as P | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| @@ -42,18 +42,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| rank_size = int(os.getenv("RANK_SIZE")) | |||
| rank_id = int(os.getenv("RANK_ID")) | |||
| if rank_size == 1: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=rank_size, shard_id=rank_id) | |||
| elif platform == "GPU": | |||
| if do_train: | |||
| from mindspore.communication.management import get_rank, get_group_size | |||
| ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=get_group_size(), shard_id=get_rank()) | |||
| else: | |||
| ds = de.MindDataset( | |||
| data_set = ds.MindDataset( | |||
| dataset_path, num_parallel_workers=8, shuffle=False) | |||
| else: | |||
| raise ValueError("Unsupport platform.") | |||
| @@ -68,7 +68,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| color_op = C.RandomColorAdjust( | |||
| brightness=0.4, contrast=0.4, saturation=0.4) | |||
| rescale_op = C.Rescale(1/255.0, 0) | |||
| rescale_op = C.Rescale(1 / 255.0, 0) | |||
| normalize_op = C.Normalize( | |||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
| change_swap_op = C.HWC2CHW() | |||
| @@ -88,18 +88,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch | |||
| trans = composeop | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(input_columns="image", operations=trans, | |||
| num_parallel_workers=8) | |||
| ds = ds.map(input_columns="label_list", | |||
| operations=type_cast_op, num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="image", operations=trans, | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(input_columns="label_list", | |||
| operations=type_cast_op, num_parallel_workers=8) | |||
| # apply shuffle operations | |||
| ds = ds.shuffle(buffer_size=buffer_size) | |||
| data_set = data_set.shuffle(buffer_size=buffer_size) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| @@ -17,7 +17,7 @@ create train or eval dataset. | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| @@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path, | |||
| device_num = get_group_size() | |||
| if device_num == 1: | |||
| ds = de.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| data_set = ds.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| else: | |||
| ds = de.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| data_set = ds.Cifar10Dataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| # define map operations | |||
| if do_train: | |||
| @@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path, | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| ds = ds.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
| def create_dataset_imagenet(dataset_path, | |||
| @@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path, | |||
| device_num = get_group_size() | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, | |||
| num_parallel_workers=8, | |||
| shuffle=True, | |||
| num_shards=device_num, | |||
| shard_id=rank_id) | |||
| image_size = 227 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path, | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| ds = ds.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, | |||
| input_columns="label", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, | |||
| input_columns="image", | |||
| num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
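Both helpers follow the same sharding recipe: query the communication layer for the group size and rank, then pass them as `num_shards`/`shard_id`. A condensed sketch (assumes `mindspore.communication.management.init()` has already been called; the path is a placeholder):

```python
# Hedged sketch of the sharding recipe used by create_dataset_cifar / create_dataset_imagenet.
import mindspore.dataset as ds
from mindspore.communication.management import get_rank, get_group_size

def sharded_cifar10(dataset_path):
    device_num = get_group_size()
    rank_id = get_rank()
    if device_num == 1:
        return ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
    return ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
                             num_shards=device_num, shard_id=rank_id)
```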
| def _get_rank_info(): | |||
| @@ -21,7 +21,7 @@ from enum import Enum | |||
| import numpy as np | |||
| import pandas as pd | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| from .config import DataConfig | |||
| @@ -142,8 +142,8 @@ class H5Dataset(): | |||
| X_id = X[:, 0:self.max_length] | |||
| X_va = X[:, self.max_length:] | |||
| yield np.array(X_id.astype(dtype=np.int32)), \ | |||
| np.array(X_va.astype(dtype=np.float32)), \ | |||
| np.array(y.astype(dtype=np.float32)) | |||
| np.array(X_va.astype(dtype=np.float32)), \ | |||
| np.array(y.astype(dtype=np.float32)) | |||
| def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||
| @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||
| for _ in range(0, numbers_of_batch, 1): | |||
| yield train_eval_gen.__next__() | |||
| ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||
| shuffle = train_mode | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| else: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
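The lambda map right above undoes the record packing: after `batch()`, each column still carries `line_per_sample` samples per stored row, and the flatten/reshape turns them into flat `(batch_size, 39)` feature tensors and a `(batch_size, 1)` label tensor. A numpy-only illustration with made-up sizes (the intermediate shape is an assumption; the flatten makes it irrelevant as long as element counts match):

```python
# Illustrative check of the flatten/reshape step; sizes are not taken from the diff.
import numpy as np

batch_size, line_per_sample = 4000, 1000
rows = int(batch_size / line_per_sample)                       # 4 packed records per batch
feat_ids = np.zeros((rows, line_per_sample, 39), np.int32)     # assumed shape after batch()
label = np.zeros((rows, line_per_sample, 1), np.float32)

assert np.array(feat_ids).flatten().reshape(batch_size, 39).shape == (batch_size, 39)
assert np.array(label).flatten().reshape(batch_size, 1).shape == (batch_size, 1)
```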
| def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| for filename in filenames: | |||
| if file_prefixt_name in filename and 'tfrecord' in filename: | |||
| dataset_files.append(os.path.join(dir_path, filename)) | |||
| schema = de.Schema() | |||
| schema = ds.Schema() | |||
| schema.add_column('feat_ids', de_type=mstype.int32) | |||
| schema.add_column('feat_vals', de_type=mstype.float32) | |||
| schema.add_column('label', de_type=mstype.float32) | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True) | |||
| else: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: ( | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||
| np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -21,7 +21,7 @@ from enum import Enum | |||
| import pandas as pd | |||
| import numpy as np | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| from .config import DataConfig | |||
| @@ -142,8 +142,8 @@ class H5Dataset(): | |||
| X_id = X[:, 0:self.max_length] | |||
| X_va = X[:, self.max_length:] | |||
| yield np.array(X_id.astype(dtype=np.int32)), \ | |||
| np.array(X_va.astype(dtype=np.float32)), \ | |||
| np.array(y.astype(dtype=np.float32)) | |||
| np.array(X_va.astype(dtype=np.float32)), \ | |||
| np.array(y.astype(dtype=np.float32)) | |||
| def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||
| @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): | |||
| for _ in range(0, numbers_of_batch, 1): | |||
| yield train_eval_gen.__next__() | |||
| ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||
| shuffle = train_mode | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| else: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| for filename in filenames: | |||
| if file_prefixt_name in filename and 'tfrecord' in filename: | |||
| dataset_files.append(os.path.join(dir_path, filename)) | |||
| schema = de.Schema() | |||
| schema = ds.Schema() | |||
| schema.add_column('feat_ids', de_type=mstype.int32) | |||
| schema.add_column('feat_vals', de_type=mstype.float32) | |||
| schema.add_column('label', de_type=mstype.float32) | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True, num_samples=3000) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, | |||
| shard_equal_rows=True, num_samples=3000) | |||
| else: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, num_samples=3000) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: ( | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, | |||
| schema=schema, num_parallel_workers=8, num_samples=3000) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||
| np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -24,17 +24,18 @@ from mindspore.nn.optim import Adam | |||
| from mindspore.train.model import Model | |||
| from mindspore.train.loss_scale_manager import DynamicLossScaleManager | |||
| from mindspore.train.callback import Callback | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as deC | |||
| from mindspore import context | |||
| from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig | |||
| from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \ | |||
| TransformerTrainOneStepWithLossScaleCell | |||
| TransformerTrainOneStepWithLossScaleCell | |||
| from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg | |||
| from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr | |||
| DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"] | |||
| def get_config(version='base', batch_size=1): | |||
| """get config""" | |||
| if version == 'large': | |||
| @@ -75,23 +76,25 @@ def get_config(version='base', batch_size=1): | |||
| transformer_cfg = TransformerConfig(batch_size=batch_size) | |||
| return transformer_cfg | |||
| def load_test_data(batch_size=1, data_file=None): | |||
| """Load test dataset.""" | |||
| ds = de.MindDataset(data_file, | |||
| columns_list=["source_eos_ids", "source_eos_mask", | |||
| "target_sos_ids", "target_sos_mask", | |||
| "target_eos_ids", "target_eos_mask"], | |||
| shuffle=False) | |||
| data_set = ds.MindDataset(data_file, | |||
| columns_list=["source_eos_ids", "source_eos_mask", | |||
| "target_sos_ids", "target_sos_mask", | |||
| "target_eos_ids", "target_eos_mask"], | |||
| shuffle=False) | |||
| type_cast_op = deC.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask") | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| return data_set | |||
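The six consecutive `TypeCast` maps in `load_test_data` differ only in the column name; when touching pipelines like this, an equivalent loop keeps the diff smaller. A hedged sketch with the column names copied from the hunk and a placeholder data file:

```python
# Sketch: apply the same TypeCast to each column in a loop (data file path is a placeholder).
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
import mindspore.common.dtype as mstype

columns = ["source_eos_ids", "source_eos_mask",
           "target_sos_ids", "target_sos_mask",
           "target_eos_ids", "target_eos_mask"]
data_set = ds.MindDataset("/path/to/test.mindrecord", columns_list=columns, shuffle=False)
type_cast_op = deC.TypeCast(mstype.int32)
for name in columns:
    data_set = data_set.map(operations=type_cast_op, input_columns=name)
data_set = data_set.batch(1, drop_remainder=True)
```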
| class ModelCallback(Callback): | |||
| def __init__(self): | |||
| @@ -107,13 +110,16 @@ class ModelCallback(Callback): | |||
| self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) | |||
| print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) | |||
| class TimeMonitor(Callback): | |||
| """Time Monitor.""" | |||
| def __init__(self, data_size): | |||
| super(TimeMonitor, self).__init__() | |||
| self.data_size = data_size | |||
| self.epoch_mseconds_list = [] | |||
| self.per_step_mseconds_list = [] | |||
| def epoch_begin(self, run_context): | |||
| self.epoch_time = time.time() | |||
| @@ -122,6 +128,7 @@ class TimeMonitor(Callback): | |||
| self.epoch_mseconds_list.append(epoch_mseconds) | |||
| self.per_step_mseconds_list.append(epoch_mseconds / self.data_size) | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_arm_ascend_training | |||
| @pytest.mark.platform_x86_ascend_training | |||
| @@ -142,7 +149,7 @@ def test_transformer(): | |||
| netwithloss = TransformerNetworkWithLoss(config, True) | |||
| lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay", | |||
| training_steps=dataset.get_dataset_size()*epoch_size, | |||
| training_steps=dataset.get_dataset_size() * epoch_size, | |||
| learning_rate=cfg.lr_schedule.learning_rate, | |||
| warmup_steps=cfg.lr_schedule.warmup_steps, | |||
| hidden_size=config.hidden_size), mstype.float32) | |||
| @@ -193,5 +200,6 @@ def test_transformer(): | |||
| print("per step mseconds: {}".format(per_step_mseconds)) | |||
| assert per_step_mseconds <= expect_per_step_mseconds + 2 | |||
| if __name__ == '__main__': | |||
| test_transformer() | |||
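The `TimeMonitor` used in this test records wall-clock time per epoch and divides by `data_size` to derive a per-step figure. A stripped-down sketch of that callback pattern, independent of the transformer specifics (the class name is illustrative):

```python
# Hedged sketch of the timing-callback pattern mirrored from TimeMonitor above.
import time
from mindspore.train.callback import Callback

class StepTimer(Callback):
    """Record epoch wall-clock time and derive per-step milliseconds."""
    def __init__(self, data_size):
        super(StepTimer, self).__init__()
        self.data_size = data_size
        self.epoch_mseconds_list = []
        self.per_step_mseconds_list = []

    def epoch_begin(self, run_context):
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        self.epoch_mseconds_list.append(epoch_mseconds)
        self.per_step_mseconds_list.append(epoch_mseconds / self.data_size)
```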
| @@ -14,13 +14,13 @@ | |||
| # ============================================================================ | |||
| """train_imagenet.""" | |||
| import os | |||
| from enum import Enum | |||
| import numpy as np | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| class DataType(Enum): | |||
| """ | |||
| Enumerate supported dataset format. | |||
| @@ -29,6 +29,7 @@ class DataType(Enum): | |||
| TFRECORD = 2 | |||
| H5 = 3 | |||
| def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | |||
| line_per_sample=1000, rank_size=None, rank_id=None): | |||
| """ | |||
| @@ -41,26 +42,29 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | |||
| for filename in filenames: | |||
| if file_prefix_name in filename and "tfrecord" in filename: | |||
| dataset_files.append(os.path.join(dirpath, filename)) | |||
| schema = de.Schema() | |||
| schema = ds.Schema() | |||
| schema.add_column('feat_ids', de_type=mstype.int32) | |||
| schema.add_column('feat_vals', de_type=mstype.float32) | |||
| schema.add_column('label', de_type=mstype.float32) | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, | |||
| num_parallel_workers=8, | |||
| num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) | |||
| else: | |||
| ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), | |||
| drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: ( | |||
| data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, | |||
| num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), | |||
| drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: ( | |||
| np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||
| #if train_mode: | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) | |||
| # if train_mode: | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, | |||
| line_per_sample=1000, rank_size=None, rank_id=None): | |||
| @@ -84,23 +88,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 | |||
| shuffle = train_mode | |||
| if rank_size is not None and rank_id is not None: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, | |||
| num_parallel_workers=8) | |||
| else: | |||
| ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| ds = ds.repeat(epochs) | |||
| return ds | |||
| data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), | |||
| columns_list=['feat_ids', 'feat_vals', 'label'], | |||
| shuffle=shuffle, num_parallel_workers=8) | |||
| data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) | |||
| data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), | |||
| np.array(y).flatten().reshape(batch_size, 39), | |||
| np.array(z).flatten().reshape(batch_size, 1))), | |||
| input_columns=['feat_ids', 'feat_vals', 'label'], | |||
| column_order=['feat_ids', 'feat_vals', 'label'], | |||
| num_parallel_workers=8) | |||
| data_set = data_set.repeat(epochs) | |||
| return data_set | |||
| def create_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, | |||
| @@ -20,7 +20,7 @@ import time | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore import context | |||
| from mindspore import log as logger | |||
| @@ -35,7 +35,6 @@ from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertNetworkWit | |||
| from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertTrainOneStepWithLossScaleCell | |||
| from model_zoo.official.nlp.bert.src.bert_model import BertConfig | |||
| _current_dir = os.path.dirname(os.path.realpath(__file__)) | |||
| DATA_DIR = ["/home/workspace/mindspore_dataset/bert/example/examples.tfrecord"] | |||
| SCHEMA_DIR = "/home/workspace/mindspore_dataset/bert/example/datasetSchema.json" | |||
| @@ -88,25 +87,26 @@ def me_de_train_dataset(sink_mode=False): | |||
| repeat_count = 1 | |||
| sink_size = -1 | |||
| batch_size = 16 | |||
| ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||
| "next_sentence_labels", "masked_lm_positions", | |||
| "masked_lm_ids", "masked_lm_weights"], shuffle=False) | |||
| data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||
| "next_sentence_labels", "masked_lm_positions", | |||
| "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=False) | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| new_repeat_count = repeat_count | |||
| if sink_mode: | |||
| sink_size = 100 | |||
| new_repeat_count = 3 | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||
| logger.info("repeat_count: {}".format(ds.get_repeat_count())) | |||
| return ds, new_repeat_count, sink_size | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||
| logger.info("repeat_count: {}".format(data_set.get_repeat_count())) | |||
| return data_set, new_repeat_count, sink_size | |||
| def weight_variable(shape): | |||
| @@ -155,13 +155,16 @@ class ModelCallback(Callback): | |||
| self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) | |||
| print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) | |||
| class TimeMonitor(Callback): | |||
| """Time Monitor.""" | |||
| def __init__(self, data_size): | |||
| super(TimeMonitor, self).__init__() | |||
| self.data_size = data_size | |||
| self.epoch_mseconds_list = [] | |||
| self.per_step_mseconds_list = [] | |||
| def epoch_begin(self, run_context): | |||
| self.epoch_time = time.time() | |||
| @@ -178,7 +181,7 @@ class TimeMonitor(Callback): | |||
| def test_bert_performance(): | |||
| """test bert performance""" | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) | |||
| ds, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True) | |||
| data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True) | |||
| version = os.getenv('VERSION', 'large') | |||
| config = get_config(version=version) | |||
| netwithloss = BertNetworkWithLoss(config, True) | |||
| @@ -221,7 +224,7 @@ def test_bert_performance(): | |||
| logger.info("***************** BERT param name is 3 {}".format(name)) | |||
| param.set_data(weight_variable(value.asnumpy().shape)) | |||
| time_monitor_callback = TimeMonitor(sink_size) | |||
| model.train(new_repeat_count, ds, callbacks=[time_monitor_callback, callback], | |||
| model.train(new_repeat_count, data_set, callbacks=[time_monitor_callback, callback], | |||
| dataset_sink_mode=True, sink_size=sink_size) | |||
| # assertion occurs while the loss value, overflow state or loss_scale value is wrong | |||
| @@ -250,5 +253,6 @@ def test_bert_performance(): | |||
| print("per step mseconds: {}".format(per_step_mseconds)) | |||
| assert per_step_mseconds <= expect_per_step_mseconds + 1 | |||
| if __name__ == '__main__': | |||
| test_bert_performance() | |||
| @@ -20,7 +20,7 @@ import time | |||
| from multiprocessing import Process, Queue | |||
| import pytest | |||
| import numpy as np | |||
| import mindspore.dataset as dataset | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.communication.management as D | |||
| from mindspore import context | |||
| @@ -28,7 +28,6 @@ from mindspore import log as logger | |||
| from mindspore.train.callback import Callback | |||
| from mindspore.context import ParallelMode | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from model_zoo.official.nlp.bert_thor.src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepCell | |||
| from model_zoo.official.nlp.bert_thor.src.bert_net_config import bert_net_cfg | |||
| @@ -45,11 +44,13 @@ train_steps = 200 | |||
| batch_size = 12 | |||
| np.random.seed(1) | |||
| dataset.config.set_seed(1) | |||
| ds.config.set_seed(1) | |||
| os.environ['GLOG_v'] = str(2) | |||
| class TimeMonitor(Callback): | |||
| """Time Monitor.""" | |||
| def __init__(self, data_size): | |||
| super(TimeMonitor, self).__init__() | |||
| self.data_size = data_size | |||
| @@ -67,6 +68,7 @@ class TimeMonitor(Callback): | |||
| self.per_step_mseconds_list.append(per_step_mseconds) | |||
| print("epoch: {}, per_step_mseconds are {}".format(cb_params.cur_epoch_num, str(per_step_mseconds)), flush=True) | |||
| class LossCallback(Callback): | |||
| def __init__(self): | |||
| super(LossCallback, self).__init__() | |||
| @@ -78,6 +80,7 @@ class LossCallback(Callback): | |||
| print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num, | |||
| str(cb_params.net_outputs)), flush=True) | |||
| def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None): | |||
| """create train dataset""" | |||
| # apply repeat operations | |||
| @@ -87,25 +90,25 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, | |||
| if "tfrecord" in file_name: | |||
| data_files.append(os.path.join(data_dir, file_name)) | |||
| data_files = sorted(data_files) | |||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, | |||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, | |||
| num_shards=device_num, shard_id=rank, shard_equal_rows=True) | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print('origin dataset size: ', ori_dataset_size) | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(ds.get_repeat_count())) | |||
| return ds | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||
| logger.info("repeat count: {}".format(data_set.get_repeat_count())) | |||
| return data_set | |||
| def _set_bert_all_reduce_split(): | |||
| @@ -151,13 +154,13 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num): | |||
| device_num=device_num) | |||
| bert_net_cfg.num_hidden_layers = 4 | |||
| ds = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, schema_dir=None) | |||
| data_set = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, | |||
| schema_dir=None) | |||
| net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) | |||
| new_repeat_count = epoch_size * ds.get_dataset_size() // data_sink_steps | |||
| new_repeat_count = epoch_size * data_set.get_dataset_size() // data_sink_steps | |||
| new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps) | |||
| lr = get_bert_lr() | |||
| damping = get_bert_damping() | |||
| optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum, | |||
| @@ -175,7 +178,7 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num): | |||
| net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) | |||
| model = Model(net_with_grads, frequency=cfg.Thor.frequency) | |||
| model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps) | |||
| model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps) | |||
| loss_list = loss_callback.loss_list | |||
| per_step_mseconds = time_monitor_callback.per_step_mseconds_list | |||
| @@ -230,5 +233,6 @@ def test_bert_thor_mlperf_8p(): | |||
| assert mean_cost < 64.2 | |||
| assert mean_loss < 7.9 | |||
| if __name__ == '__main__': | |||
| test_bert_thor_mlperf_8p() | |||
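In train_process_bert_thor the sink-mode repeat count is derived from the dataset size and then capped by the global train_steps (200, from the constants above). The arithmetic below walks through that cap with assumed values for epoch_size, data_sink_steps and the sharded dataset size, none of which appear in this hunk:

    # Illustrative numbers only; train_steps = 200 comes from the test constants.
    epoch_size, data_sink_steps, train_steps = 40, 100, 200
    dataset_size = 320                                                     # assumed get_dataset_size()
    new_repeat_count = epoch_size * dataset_size // data_sink_steps        # 128 sink cycles available
    new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps)  # capped to 2
    # model.train(new_repeat_count, ..., sink_size=data_sink_steps) then runs
    # 2 cycles x 100 steps = exactly train_steps steps.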
| @@ -20,7 +20,7 @@ import time | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore import context | |||
| from mindspore import log as logger | |||
| @@ -87,25 +87,26 @@ def me_de_train_dataset(sink_mode=False): | |||
| repeat_count = 1 | |||
| sink_size = -1 | |||
| batch_size = 16 | |||
| ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||
| "next_sentence_labels", "masked_lm_positions", | |||
| "masked_lm_ids", "masked_lm_weights"], shuffle=False) | |||
| data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", | |||
| "next_sentence_labels", "masked_lm_positions", | |||
| "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=False) | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| new_repeat_count = repeat_count | |||
| if sink_mode: | |||
| sink_size = 100 | |||
| new_repeat_count = 3 | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||
| logger.info("repeat_count: {}".format(ds.get_repeat_count())) | |||
| return ds, new_repeat_count, sink_size | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||
| logger.info("repeat_count: {}".format(data_set.get_repeat_count())) | |||
| return data_set, new_repeat_count, sink_size | |||
| def weight_variable(shape): | |||
| @@ -178,11 +179,11 @@ def test_bert_percision(enable_graph_kernel=False): | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) | |||
| if enable_graph_kernel: | |||
| context.set_context(enable_graph_kernel=True) | |||
| ds, new_repeat_count, _ = me_de_train_dataset() | |||
| data_set, new_repeat_count, _ = me_de_train_dataset() | |||
| version = os.getenv('VERSION', 'large') | |||
| config = get_config(version=version) | |||
| netwithloss = BertNetworkWithLoss(config, True) | |||
| lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count, | |||
| lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count, | |||
| learning_rate=5e-5, end_learning_rate=1e-9, | |||
| power=10.0, warmup_steps=0) | |||
| decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower() | |||
| @@ -218,7 +219,7 @@ def test_bert_percision(enable_graph_kernel=False): | |||
| else: | |||
| logger.info("***************** BERT param name is 3 {}".format(name)) | |||
| param.set_data(weight_variable(value.asnumpy().shape)) | |||
| model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False) | |||
| model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False) | |||
| # assertion occurs when the loss value, overflow state or loss_scale value is wrong | |||
| loss_value = np.array(callback.loss_list) | |||
| @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py | |||
| """ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine.datasets as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore import log as logger | |||
| from .config import bert_net_cfg | |||
| @@ -32,24 +32,24 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", d | |||
| for file_name in files: | |||
| if "tfrecord" in file_name: | |||
| data_files.append(os.path.join(data_dir, file_name)) | |||
| ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, | |||
| shard_equal_rows=True) | |||
| ori_dataset_size = ds.get_dataset_size() | |||
| data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, | |||
| columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", | |||
| "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], | |||
| shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, | |||
| shard_equal_rows=True) | |||
| ori_dataset_size = data_set.get_dataset_size() | |||
| print('origin dataset size: ', ori_dataset_size) | |||
| new_repeat_count = int(repeat_count * ori_dataset_size // ds.get_dataset_size()) | |||
| new_repeat_count = int(repeat_count * ori_dataset_size // data_set.get_dataset_size()) | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| ds = ds.map(operations=type_cast_op, input_columns="segment_ids") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_mask") | |||
| ds = ds.map(operations=type_cast_op, input_columns="input_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") | |||
| # apply batch operations | |||
| ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||
| ds = ds.repeat(max(new_repeat_count, repeat_count)) | |||
| logger.info("data size: {}".format(ds.get_dataset_size())) | |||
| logger.info("repeatcount: {}".format(ds.get_repeat_count())) | |||
| return ds, new_repeat_count | |||
| data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True) | |||
| data_set = data_set.repeat(max(new_repeat_count, repeat_count)) | |||
| logger.info("data size: {}".format(data_set.get_dataset_size())) | |||
| logger.info("repeatcount: {}".format(data_set.get_repeat_count())) | |||
| return data_set, new_repeat_count | |||
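The renamed create_bert_dataset keeps the pipeline shape unchanged: a sharded TFRecordDataset, a TypeCast to int32 over every integer column, then batch and repeat. A condensed, self-contained sketch under the new import style (data_files is a hypothetical list of tfrecord paths, and the repeat-count bookkeeping is omitted):

    import mindspore.common.dtype as mstype
    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as C

    def build_pretrain_dataset(data_files, device_num=1, rank=0, batch_size=32):
        data_set = ds.TFRecordDataset(data_files,
                                      columns_list=["input_ids", "input_mask", "segment_ids",
                                                    "next_sentence_labels", "masked_lm_positions",
                                                    "masked_lm_ids", "masked_lm_weights"],
                                      shuffle=False, num_shards=device_num, shard_id=rank)
        type_cast_op = C.TypeCast(mstype.int32)
        # Same casts as in the hunk above, just folded into a loop.
        for column in ["masked_lm_ids", "masked_lm_positions", "next_sentence_labels",
                       "segment_ids", "input_mask", "input_ids"]:
            data_set = data_set.map(operations=type_cast_op, input_columns=column)
        return data_set.batch(batch_size, drop_remainder=True)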
| @@ -17,7 +17,7 @@ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| @@ -39,10 +39,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||
| device_num = int(os.getenv("RANK_SIZE")) | |||
| rank_id = int(os.getenv("RANK_ID")) | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -65,15 +65,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||
| C.HWC2CHW() | |||
| ] | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| return ds | |||
| data_set = data_set.repeat(repeat_num) | |||
| return data_set | |||
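Both ImageFolder pipelines in this area take their sharding from the RANK_SIZE / RANK_ID environment variables before any transform runs. The diff keeps the explicit if/else over device_num; one possible condensation (not what the change itself does) is to pass the shard arguments only when running distributed:

    import os
    import mindspore.dataset as ds

    def make_source(dataset_path, num_parallel_workers=8):
        device_num = int(os.getenv("RANK_SIZE", "1"))
        rank_id = int(os.getenv("RANK_ID", "0"))
        # Omit num_shards/shard_id entirely on a single device.
        shard_kwargs = {} if device_num == 1 else {"num_shards": device_num, "shard_id": rank_id}
        return ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallel_workers,
                                     shuffle=True, **shard_kwargs)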
| @@ -18,12 +18,11 @@ | |||
| import os | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset as dataset | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| dataset.config.set_seed(1) | |||
| ds.config.set_seed(1) | |||
| def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||
| @@ -43,10 +42,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||
| device_num = int(os.getenv("RANK_SIZE")) | |||
| rank_id = int(os.getenv("RANK_ID")) | |||
| if device_num == 1: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) | |||
| else: | |||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| image_size = 224 | |||
| mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] | |||
| @@ -71,12 +70,12 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| return ds | |||
| data_set = data_set.repeat(repeat_num) | |||
| return data_set | |||
| @@ -14,11 +14,10 @@ | |||
| # ============================================================================ | |||
| """ create train dataset. """ | |||
| from functools import partial | |||
| import mindspore.common.dtype as mstype | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C2 | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| @@ -37,8 +36,8 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32): | |||
| dataset | |||
| """ | |||
| load_func = partial(de.Cifar10Dataset, dataset_path) | |||
| ds = load_func(num_parallel_workers=8, shuffle=False) | |||
| load_func = partial(ds.Cifar10Dataset, dataset_path) | |||
| data_set = load_func(num_parallel_workers=8, shuffle=False) | |||
| resize_height = config.image_height | |||
| resize_width = config.image_width | |||
| @@ -54,15 +53,15 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32): | |||
| type_cast_op = C2.TypeCast(mstype.int32) | |||
| ds = ds.map(operations=c_trans, input_columns="image", | |||
| num_parallel_workers=8) | |||
| ds = ds.map(operations=type_cast_op, | |||
| input_columns="label", num_parallel_workers=8) | |||
| data_set = data_set.map(operations=c_trans, input_columns="image", | |||
| num_parallel_workers=8) | |||
| data_set = data_set.map(operations=type_cast_op, | |||
| input_columns="label", num_parallel_workers=8) | |||
| # apply batch operations | |||
| ds = ds.batch(batch_size, drop_remainder=True) | |||
| data_set = data_set.batch(batch_size, drop_remainder=True) | |||
| # apply dataset repeat operation | |||
| ds = ds.repeat(repeat_num) | |||
| data_set = data_set.repeat(repeat_num) | |||
| return ds | |||
| return data_set | |||
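The CIFAR-10 loader above binds the dataset path with functools.partial, so the call site only deals with loader options. A short usage sketch with an illustrative path:

    from functools import partial
    import mindspore.dataset as ds

    load_func = partial(ds.Cifar10Dataset, "/path/to/cifar10")   # path is illustrative
    data_set = load_func(num_parallel_workers=8, shuffle=False)  # same call as in create_dataset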
| @@ -16,7 +16,7 @@ | |||
| Testing AutoContrast op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.py_transforms | |||
| import mindspore.dataset.vision.py_transforms as F | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| @@ -36,13 +36,13 @@ def test_auto_contrast_py(plot=False): | |||
| logger.info("Test AutoContrast Python Op") | |||
| # Original Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.ToTensor()]) | |||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||
| ds_original = ds_original.batch(512) | |||
| @@ -55,7 +55,7 @@ def test_auto_contrast_py(plot=False): | |||
| axis=0) | |||
| # AutoContrast Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_auto_contrast = \ | |||
| mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| @@ -63,7 +63,7 @@ def test_auto_contrast_py(plot=False): | |||
| F.AutoContrast(cutoff=10.0, ignore=[10, 20]), | |||
| F.ToTensor()]) | |||
| ds_auto_contrast = ds.map(operations=transforms_auto_contrast, input_columns="image") | |||
| ds_auto_contrast = data_set.map(operations=transforms_auto_contrast, input_columns="image") | |||
| ds_auto_contrast = ds_auto_contrast.batch(512) | |||
| @@ -96,15 +96,15 @@ def test_auto_contrast_c(plot=False): | |||
| logger.info("Test AutoContrast C Op") | |||
| # AutoContrast Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| python_op = F.AutoContrast(cutoff=10.0, ignore=[10, 20]) | |||
| c_op = C.AutoContrast(cutoff=10.0, ignore=[10, 20]) | |||
| transforms_op = mindspore.dataset.transforms.py_transforms.Compose([lambda img: F.ToPIL()(img.astype(np.uint8)), | |||
| python_op, | |||
| np.array]) | |||
| ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image") | |||
| ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image") | |||
| ds_auto_contrast_py = ds_auto_contrast_py.batch(512) | |||
| @@ -116,10 +116,10 @@ def test_auto_contrast_c(plot=False): | |||
| image.asnumpy(), | |||
| axis=0) | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image") | |||
| ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image") | |||
| ds_auto_contrast_c = ds_auto_contrast_c.batch(512) | |||
| @@ -153,8 +153,8 @@ def test_auto_contrast_one_channel_c(plot=False): | |||
| logger.info("Test AutoContrast C Op With One Channel Images") | |||
| # AutoContrast Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| python_op = F.AutoContrast() | |||
| c_op = C.AutoContrast() | |||
| # not using F.ToTensor() since it converts to floats | |||
| @@ -164,7 +164,7 @@ def test_auto_contrast_one_channel_c(plot=False): | |||
| python_op, | |||
| np.array]) | |||
| ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image") | |||
| ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image") | |||
| ds_auto_contrast_py = ds_auto_contrast_py.batch(512) | |||
| @@ -176,11 +176,11 @@ def test_auto_contrast_one_channel_c(plot=False): | |||
| image.asnumpy(), | |||
| axis=0) | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])], | |||
| input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])], | |||
| input_columns=["image"]) | |||
| ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image") | |||
| ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image") | |||
| ds_auto_contrast_c = ds_auto_contrast_c.batch(512) | |||
| @@ -208,9 +208,9 @@ def test_auto_contrast_mnist_c(plot=False): | |||
| Test AutoContrast C op with MNIST dataset (Grayscale images) | |||
| """ | |||
| logger.info("Test AutoContrast C Op With MNIST Images") | |||
| ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| ds_auto_contrast_c = ds.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image") | |||
| ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| ds_auto_contrast_c = data_set.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image") | |||
| ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| images = [] | |||
| images_trans = [] | |||
| @@ -236,21 +236,21 @@ def test_auto_contrast_invalid_ignore_param_c(): | |||
| """ | |||
| logger.info("Test AutoContrast C Op with invalid ignore parameter") | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), | |||
| C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), | |||
| C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| # invalid ignore | |||
| ds = ds.map(operations=C.AutoContrast(ignore=255.5), input_columns="image") | |||
| data_set = data_set.map(operations=C.AutoContrast(ignore=255.5), input_columns="image") | |||
| except TypeError as error: | |||
| logger.info("Got an exception in DE: {}".format(str(error))) | |||
| assert "Argument ignore with value 255.5 is not of type" in str(error) | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| # invalid ignore | |||
| ds = ds.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image") | |||
| data_set = data_set.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image") | |||
| except TypeError as error: | |||
| logger.info("Got an exception in DE: {}".format(str(error))) | |||
| assert "Argument ignore with value (10,100) is not of type" in str(error) | |||
| @@ -262,22 +262,22 @@ def test_auto_contrast_invalid_cutoff_param_c(): | |||
| """ | |||
| logger.info("Test AutoContrast C Op with invalid cutoff parameter") | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), | |||
| C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), | |||
| C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| # invalid cutoff | |||
| ds = ds.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image") | |||
| data_set = data_set.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image") | |||
| except ValueError as error: | |||
| logger.info("Got an exception in DE: {}".format(str(error))) | |||
| assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), | |||
| C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), | |||
| C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| # invalid cutoff | |||
| ds = ds.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image") | |||
| data_set = data_set.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image") | |||
| except ValueError as error: | |||
| logger.info("Got an exception in DE: {}".format(str(error))) | |||
| assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | |||
| @@ -289,22 +289,24 @@ def test_auto_contrast_invalid_ignore_param_py(): | |||
| """ | |||
| logger.info("Test AutoContrast python Op with invalid ignore parameter") | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.AutoContrast(ignore=255.5), | |||
| F.ToTensor()])], | |||
| input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.AutoContrast( | |||
| ignore=255.5), | |||
| F.ToTensor()])], | |||
| input_columns=["image"]) | |||
| except TypeError as error: | |||
| logger.info("Got an exception in DE: {}".format(str(error))) | |||
| assert "Argument ignore with value 255.5 is not of type" in str(error) | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.AutoContrast(ignore=(10, 100)), | |||
| F.ToTensor()])], | |||
| input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.AutoContrast( | |||
| ignore=(10, 100)), | |||
| F.ToTensor()])], | |||
| input_columns=["image"]) | |||
| except TypeError as error: | |||
| logger.info("Got an exception in DE: {}".format(str(error))) | |||
| assert "Argument ignore with value (10,100) is not of type" in str(error) | |||
| @@ -316,18 +318,19 @@ def test_auto_contrast_invalid_cutoff_param_py(): | |||
| """ | |||
| logger.info("Test AutoContrast python Op with invalid cutoff parameter") | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.AutoContrast(cutoff=-10.0), | |||
| F.ToTensor()])], | |||
| input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.AutoContrast( | |||
| cutoff=-10.0), | |||
| F.ToTensor()])], | |||
| input_columns=["image"]) | |||
| except ValueError as error: | |||
| logger.info("Got an exception in DE: {}".format(str(error))) | |||
| assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map( | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map( | |||
| operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.AutoContrast(cutoff=120.0), | |||
| @@ -17,7 +17,7 @@ Testing Equalize op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.py_transforms | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.vision.py_transforms as F | |||
| @@ -37,13 +37,13 @@ def test_equalize_py(plot=False): | |||
| logger.info("Test Equalize") | |||
| # Original Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.ToTensor()]) | |||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||
| ds_original = ds_original.batch(512) | |||
| @@ -56,14 +56,14 @@ def test_equalize_py(plot=False): | |||
| axis=0) | |||
| # Color Equalized Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_equalize = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.Equalize(), | |||
| F.ToTensor()]) | |||
| ds_equalize = ds.map(operations=transforms_equalize, input_columns="image") | |||
| ds_equalize = data_set.map(operations=transforms_equalize, input_columns="image") | |||
| ds_equalize = ds_equalize.batch(512) | |||
| @@ -92,11 +92,11 @@ def test_equalize_c(plot=False): | |||
| logger.info("Test Equalize cpp op") | |||
| # Original Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = [C.Decode(), C.Resize(size=[224, 224])] | |||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||
| ds_original = ds_original.batch(512) | |||
| @@ -109,12 +109,12 @@ def test_equalize_c(plot=False): | |||
| axis=0) | |||
| # Equalize Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transform_equalize = [C.Decode(), C.Resize(size=[224, 224]), | |||
| C.Equalize()] | |||
| ds_equalize = ds.map(operations=transform_equalize, input_columns="image") | |||
| ds_equalize = data_set.map(operations=transform_equalize, input_columns="image") | |||
| ds_equalize = ds_equalize.batch(512) | |||
| @@ -142,10 +142,10 @@ def test_equalize_py_c(plot=False): | |||
| logger.info("Test Equalize cpp and python op") | |||
| # equalize Images in cpp | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| ds_c_equalize = ds.map(operations=C.Equalize(), input_columns="image") | |||
| ds_c_equalize = data_set.map(operations=C.Equalize(), input_columns="image") | |||
| ds_c_equalize = ds_c_equalize.batch(512) | |||
| @@ -158,15 +158,15 @@ def test_equalize_py_c(plot=False): | |||
| axis=0) | |||
| # Equalize images in python | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| transforms_p_equalize = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), | |||
| F.ToPIL(), | |||
| F.Equalize(), | |||
| np.array]) | |||
| ds_p_equalize = ds.map(operations=transforms_p_equalize, input_columns="image") | |||
| ds_p_equalize = data_set.map(operations=transforms_p_equalize, input_columns="image") | |||
| ds_p_equalize = ds_p_equalize.batch(512) | |||
| @@ -197,11 +197,11 @@ def test_equalize_one_channel(): | |||
| c_op = C.Equalize() | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| ds.map(operations=c_op, input_columns="image") | |||
| data_set.map(operations=c_op, input_columns="image") | |||
| except RuntimeError as e: | |||
| logger.info("Got an exception in DE: {}".format(str(e))) | |||
| @@ -213,9 +213,9 @@ def test_equalize_mnist_c(plot=False): | |||
| Test Equalize C op with MNIST dataset (Grayscale images) | |||
| """ | |||
| logger.info("Test Equalize C Op With MNIST Images") | |||
| ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| ds_equalize_c = ds.map(operations=C.Equalize(), input_columns="image") | |||
| ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| ds_equalize_c = data_set.map(operations=C.Equalize(), input_columns="image") | |||
| ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| images = [] | |||
| images_trans = [] | |||
| @@ -242,7 +242,7 @@ def test_equalize_md5_py(): | |||
| logger.info("Test Equalize") | |||
| # First dataset | |||
| data1 = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data1 = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Equalize(), | |||
| F.ToTensor()]) | |||
| @@ -260,14 +260,14 @@ def test_equalize_md5_c(): | |||
| logger.info("Test Equalize cpp op with md5 check") | |||
| # Generate dataset | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_equalize = [C.Decode(), | |||
| C.Resize(size=[224, 224]), | |||
| C.Equalize(), | |||
| F.ToTensor()] | |||
| data = ds.map(operations=transforms_equalize, input_columns="image") | |||
| data = data_set.map(operations=transforms_equalize, input_columns="image") | |||
| # Compare with expected md5 from images | |||
| filename = "equalize_01_result_c.npz" | |||
| save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | |||
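The *_md5_* tests all end the same way: build the pipeline, then hand it to save_and_check_md5, which either records a golden .npz (when GENERATE_GOLDEN is flipped on) or compares the current output against it. A hedged usage sketch, assuming the test-suite util module is importable and using an illustrative directory and golden filename:

    import mindspore.dataset as ds
    import mindspore.dataset.vision.c_transforms as C
    from util import save_and_check_md5           # test-suite helper

    GENERATE_GOLDEN = False                        # set True once to (re)record the golden

    data = ds.ImageFolderDataset(dataset_dir="/path/to/testImageNetData", shuffle=False)
    data = data.map(operations=[C.Decode(), C.Resize((224, 224)), C.Equalize()],
                    input_columns="image")
    save_and_check_md5(data, "my_equalize_golden.npz", generate_golden=GENERATE_GOLDEN)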
| @@ -17,7 +17,7 @@ Testing Invert op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.py_transforms | |||
| import mindspore.dataset.vision.py_transforms as F | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| @@ -36,13 +36,13 @@ def test_invert_py(plot=False): | |||
| logger.info("Test Invert Python op") | |||
| # Original Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.ToTensor()]) | |||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||
| ds_original = ds_original.batch(512) | |||
| @@ -55,14 +55,14 @@ def test_invert_py(plot=False): | |||
| axis=0) | |||
| # Color Inverted Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.Invert(), | |||
| F.ToTensor()]) | |||
| ds_invert = ds.map(operations=transforms_invert, input_columns="image") | |||
| ds_invert = data_set.map(operations=transforms_invert, input_columns="image") | |||
| ds_invert = ds_invert.batch(512) | |||
| @@ -91,11 +91,11 @@ def test_invert_c(plot=False): | |||
| logger.info("Test Invert cpp op") | |||
| # Original Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = [C.Decode(), C.Resize(size=[224, 224])] | |||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||
| ds_original = ds_original.batch(512) | |||
| @@ -108,12 +108,12 @@ def test_invert_c(plot=False): | |||
| axis=0) | |||
| # Invert Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transform_invert = [C.Decode(), C.Resize(size=[224, 224]), | |||
| C.Invert()] | |||
| ds_invert = ds.map(operations=transform_invert, input_columns="image") | |||
| ds_invert = data_set.map(operations=transform_invert, input_columns="image") | |||
| ds_invert = ds_invert.batch(512) | |||
| @@ -141,10 +141,10 @@ def test_invert_py_c(plot=False): | |||
| logger.info("Test Invert cpp and python op") | |||
| # Invert Images in cpp | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| ds_c_invert = ds.map(operations=C.Invert(), input_columns="image") | |||
| ds_c_invert = data_set.map(operations=C.Invert(), input_columns="image") | |||
| ds_c_invert = ds_c_invert.batch(512) | |||
| @@ -157,15 +157,15 @@ def test_invert_py_c(plot=False): | |||
| axis=0) | |||
| # invert images in python | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) | |||
| transforms_p_invert = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), | |||
| F.ToPIL(), | |||
| F.Invert(), | |||
| np.array]) | |||
| ds_p_invert = ds.map(operations=transforms_p_invert, input_columns="image") | |||
| ds_p_invert = data_set.map(operations=transforms_p_invert, input_columns="image") | |||
| ds_p_invert = ds_p_invert.batch(512) | |||
| @@ -196,11 +196,11 @@ def test_invert_one_channel(): | |||
| c_op = C.Invert() | |||
| try: | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| lambda img: np.array(img[:, :, 0])], input_columns=["image"]) | |||
| ds.map(operations=c_op, input_columns="image") | |||
| data_set.map(operations=c_op, input_columns="image") | |||
| except RuntimeError as e: | |||
| logger.info("Got an exception in DE: {}".format(str(e))) | |||
| @@ -214,13 +214,13 @@ def test_invert_md5_py(): | |||
| logger.info("Test Invert python op with md5 check") | |||
| # Generate dataset | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Invert(), | |||
| F.ToTensor()]) | |||
| data = ds.map(operations=transforms_invert, input_columns="image") | |||
| data = data_set.map(operations=transforms_invert, input_columns="image") | |||
| # Compare with expected md5 from images | |||
| filename = "invert_01_result_py.npz" | |||
| save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | |||
| @@ -233,14 +233,14 @@ def test_invert_md5_c(): | |||
| logger.info("Test Invert cpp op with md5 check") | |||
| # Generate dataset | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_invert = [C.Decode(), | |||
| C.Resize(size=[224, 224]), | |||
| C.Invert(), | |||
| F.ToTensor()] | |||
| data = ds.map(operations=transforms_invert, input_columns="image") | |||
| data = data_set.map(operations=transforms_invert, input_columns="image") | |||
| # Compare with expected md5 from images | |||
| filename = "invert_01_result_c.npz" | |||
| save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) | |||
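The C-vs-Python comparison tests above all sandwich the Python op the same way: the C ops hand over uint8 numpy arrays, so the PIL-based transform is wrapped between a ToPIL conversion and np.array to come back out as a numpy image. The helper below is only a restatement of that Compose pattern, not an API from the tests:

    import numpy as np
    import mindspore.dataset.transforms.py_transforms as py_transforms
    import mindspore.dataset.vision.py_transforms as F

    def wrap_pil_op(python_op):
        # uint8 ndarray -> PIL image -> python_op -> back to ndarray
        return py_transforms.Compose([lambda img: F.ToPIL()(img.astype(np.uint8)),
                                      python_op,
                                      np.array])

    # e.g. data_set.map(operations=wrap_pil_op(F.Invert()), input_columns="image")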
| @@ -19,7 +19,6 @@ import numpy as np | |||
| import pytest | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset.transforms.py_transforms | |||
| import mindspore.dataset.vision.c_transforms as vision | |||
| import mindspore.dataset.vision.py_transforms as F | |||
| @@ -44,7 +43,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False): | |||
| logger.info("Test RandomColor") | |||
| # Original Images | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| @@ -63,7 +62,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False): | |||
| axis=0) | |||
| # Random Color Adjusted Images | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_random_color = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| @@ -146,7 +145,7 @@ def test_random_color_py_md5(): | |||
| original_num_parallel_workers = config_get_set_num_parallel_workers(1) | |||
| # Generate dataset | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.RandomColor((2.0, 2.5)), | |||
| @@ -234,7 +233,7 @@ def test_random_color_c_errors(): | |||
| assert "degrees must be a sequence with length 2." in str(error_info.value) | |||
| # RandomColor Cpp Op will fail with one channel input | |||
| mnist_ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| mnist_ds = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| mnist_ds = mnist_ds.map(operations=vision.RandomColor(), input_columns="image") | |||
| with pytest.raises(RuntimeError) as error_info: | |||
| @@ -17,7 +17,6 @@ Testing RandomSharpness op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset.transforms.py_transforms | |||
| import mindspore.dataset.vision.py_transforms as F | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| @@ -38,7 +37,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False): | |||
| logger.info("Test RandomSharpness python op") | |||
| # Original Images | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| @@ -57,7 +56,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False): | |||
| axis=0) | |||
| # Random Sharpness Adjusted Images | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| py_op = F.RandomSharpness() | |||
| if degrees is not None: | |||
| @@ -108,7 +107,7 @@ def test_random_sharpness_py_md5(): | |||
| transform = mindspore.dataset.transforms.py_transforms.Compose(transforms) | |||
| # Generate dataset | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = data.map(operations=transform, input_columns=["image"]) | |||
| # check results with md5 comparison | |||
| @@ -128,7 +127,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False): | |||
| logger.info("Test RandomSharpness cpp op") | |||
| # Original Images | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = [C.Decode(), | |||
| C.Resize((224, 224))] | |||
| @@ -146,7 +145,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False): | |||
| axis=0) | |||
| # Random Sharpness Adjusted Images | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| c_op = C.RandomSharpness() | |||
| if degrees is not None: | |||
| @@ -194,7 +193,7 @@ def test_random_sharpness_c_md5(): | |||
| ] | |||
| # Generate dataset | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = data.map(operations=transforms, input_columns=["image"]) | |||
| # check results with md5 comparison | |||
| @@ -213,7 +212,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False): | |||
| logger.info("Test RandomSharpness C and python Op") | |||
| # RandomSharpness Images | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"]) | |||
| python_op = F.RandomSharpness(degrees) | |||
| @@ -236,7 +235,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False): | |||
| image, | |||
| axis=0) | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"]) | |||
| ds_images_random_sharpness_c = data.map(operations=c_op, input_columns="image") | |||
| @@ -271,10 +270,10 @@ def test_random_sharpness_one_channel_c(degrees=(1.4, 1.4), plot=False): | |||
| if degrees is not None: | |||
| c_op = C.RandomSharpness(degrees) | |||
| # RandomSharpness Images | |||
| data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| ds_random_sharpness_c = data.map(operations=c_op, input_columns="image") | |||
| # Original images | |||
| data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| images = [] | |||
| images_trans = [] | |||
| @@ -296,7 +295,7 @@ def test_random_sharpness_invalid_params(): | |||
| """ | |||
| logger.info("Test RandomSharpness with invalid input parameters.") | |||
| try: | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| C.RandomSharpness(10)], input_columns=["image"]) | |||
| except TypeError as error: | |||
| @@ -304,7 +303,7 @@ def test_random_sharpness_invalid_params(): | |||
| assert "tuple" in str(error) | |||
| try: | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| C.RandomSharpness((-10, 10))], input_columns=["image"]) | |||
| except ValueError as error: | |||
| @@ -312,7 +311,7 @@ def test_random_sharpness_invalid_params(): | |||
| assert "interval" in str(error) | |||
| try: | |||
| data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data = data.map(operations=[C.Decode(), C.Resize((224, 224)), | |||
| C.RandomSharpness((10, 5))], input_columns=["image"]) | |||
| except ValueError as error: | |||
| @@ -17,7 +17,6 @@ Testing RandomSolarizeOp op in DE | |||
| """ | |||
| import pytest | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset.vision.c_transforms as vision | |||
| from mindspore import log as logger | |||
| from util import visualize_list, save_and_check_md5, config_get_set_seed, config_get_set_num_parallel_workers, \ | |||
| @@ -78,8 +77,8 @@ def test_random_solarize_mnist(plot=False, run_golden=True): | |||
| Test RandomSolarize op with MNIST dataset (Grayscale images) | |||
| """ | |||
| mnist_1 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| mnist_2 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| mnist_1 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| mnist_2 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) | |||
| mnist_2 = mnist_2.map(operations=vision.RandomSolarize((0, 255)), input_columns="image") | |||
| images = [] | |||
| @@ -18,7 +18,7 @@ Testing UniformAugment in DE | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore.dataset.engine as de | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.py_transforms | |||
| import mindspore.dataset.vision.c_transforms as C | |||
| import mindspore.dataset.vision.py_transforms as F | |||
| @@ -35,13 +35,13 @@ def test_uniform_augment(plot=False, num_ops=2): | |||
| logger.info("Test UniformAugment") | |||
| # Original Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), | |||
| F.Resize((224, 224)), | |||
| F.ToTensor()]) | |||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||
| ds_original = ds_original.batch(512) | |||
| @@ -54,7 +54,7 @@ def test_uniform_augment(plot=False, num_ops=2): | |||
| axis=0) | |||
| # UniformAugment Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transform_list = [F.RandomRotation(45), | |||
| F.RandomColor(), | |||
| @@ -70,7 +70,7 @@ def test_uniform_augment(plot=False, num_ops=2): | |||
| num_ops=num_ops), | |||
| F.ToTensor()]) | |||
| ds_ua = ds.map(operations=transforms_ua, input_columns="image") | |||
| ds_ua = data_set.map(operations=transforms_ua, input_columns="image") | |||
| ds_ua = ds_ua.batch(512) | |||
| @@ -99,12 +99,12 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): | |||
| logger.info("Test CPP UniformAugment") | |||
| # Original Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_original = [C.Decode(), C.Resize(size=[224, 224]), | |||
| F.ToTensor()] | |||
| ds_original = ds.map(operations=transforms_original, input_columns="image") | |||
| ds_original = data_set.map(operations=transforms_original, input_columns="image") | |||
| ds_original = ds_original.batch(512) | |||
| @@ -117,7 +117,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): | |||
| axis=0) | |||
| # UniformAugment Images | |||
| ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) | |||
| transforms_ua = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]), | |||
| C.RandomHorizontalFlip(), | |||
| C.RandomVerticalFlip(), | |||
| @@ -130,7 +130,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): | |||
| uni_aug, | |||
| F.ToTensor()] | |||
| ds_ua = ds.map(operations=transforms_all, input_columns="image", num_parallel_workers=1) | |||
| ds_ua = data_set.map(operations=transforms_all, input_columns="image", num_parallel_workers=1) | |||
| ds_ua = ds_ua.batch(512) | |||
| @@ -240,7 +240,7 @@ def test_cpp_uniform_augment_random_crop_badinput(num_ops=1): | |||
| logger.info("Test CPP UniformAugment with random_crop bad input") | |||
| batch_size = 2 | |||
| cifar10_dir = "../data/dataset/testCifar10Data" | |||
| ds1 = de.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3] | |||
| ds1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3] | |||
| transforms_ua = [ | |||
| # Note: crop size [224, 224] > image size [32, 32] | |||