
change code to import APIs from mindspore.dataset rather than mindspore.dataset.engine

tags/v1.1.0
Xiao Tianci 5 years ago
parent commit 31fed1a2f6
57 changed files with 1137 additions and 1083 deletions
  1. +2  -2   model_zoo/official/cv/centerface/src/dataset.py
  2. +22 -19  model_zoo/official/cv/cnn_direction_model/src/dataset.py
  3. +6  -6   model_zoo/official/cv/crnn/src/dataset.py
  4. +11 -11  model_zoo/official/cv/inceptionv3/src/dataset.py
  5. +17 -17  model_zoo/official/cv/mobilenetv1/src/dataset.py
  6. +16 -16  model_zoo/official/cv/mobilenetv2/src/dataset.py
  7. +26 -25  model_zoo/official/cv/mobilenetv2_quant/src/dataset.py
  8. +11 -11  model_zoo/official/cv/mobilenetv3/src/dataset.py
  9. +9  -9   model_zoo/official/cv/nasnet/src/dataset.py
  10. +26 -10  model_zoo/official/cv/psenet/src/dataset.py
  11. +19 -11  model_zoo/official/cv/resnet/gpu_resnet_benchmark.py
  12. +33 -33  model_zoo/official/cv/resnet/src/dataset.py
  13. +20 -19  model_zoo/official/cv/resnet50_quant/src/dataset.py
  14. +9  -9   model_zoo/official/cv/resnet_thor/src/dataset.py
  15. +8  -8   model_zoo/official/cv/shufflenetv1/src/dataset.py
  16. +8  -8   model_zoo/official/cv/shufflenetv2/src/dataset.py
  17. +35 -35  model_zoo/official/cv/squeezenet/src/dataset.py
  18. +8  -8   model_zoo/official/cv/warpctc/src/dataset.py
  19. +9  -8   model_zoo/official/cv/xception/src/dataset.py
  20. +54 -52  model_zoo/official/nlp/bert/src/dataset.py
  21. +57 -57  model_zoo/official/nlp/bert_thor/src/dataset.py
  22. +7  -7   model_zoo/official/nlp/fasttext/eval.py
  23. +3  -3   model_zoo/official/nlp/fasttext/src/dataset.py
  24. +25 -22  model_zoo/official/nlp/fasttext/src/load_dataset.py
  25. +19 -19  model_zoo/official/nlp/gnmt_v2/src/dataset/load_dataset.py
  26. +15 -15  model_zoo/official/nlp/mass/src/dataset/load_dataset.py
  27. +15 -15  model_zoo/official/nlp/prophetnet/src/dataset/load_dataset.py
  28. +15 -13  model_zoo/official/nlp/tinybert/src/dataset.py
  29. +21 -15  model_zoo/official/nlp/transformer/eval.py
  30. +36 -36  model_zoo/official/recommend/deepfm/src/dataset.py
  31. +46 -48  model_zoo/official/recommend/wide_and_deep/src/datasets.py
  32. +2  -2   model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py
  33. +2  -2   model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py
  34. +24 -26  model_zoo/official/recommend/wide_and_deep_multitable/src/datasets.py
  35. +34 -37  model_zoo/research/cv/centernet/src/dataset.py
  36. +16 -16  model_zoo/research/cv/ghostnet/src/dataset.py
  37. +16 -16  model_zoo/research/cv/ghostnet_quant/src/dataset.py
  38. +16 -16  model_zoo/research/cv/resnet50_adv_pruning/src/pet_dataset.py
  39. +35 -35  model_zoo/research/cv/squeezenet/src/dataset.py
  40. +36 -36  model_zoo/research/recommend/autodis/src/dataset.py
  41. +36 -36  tests/st/model_zoo_tests/DeepFM/src/dataset.py
  42. +24 -16  tests/st/model_zoo_tests/transformer/test_transformer.py
  43. +34 -30  tests/st/model_zoo_tests/wide_and_deep/python_file_for_ci/datasets.py
  44. +21 -17  tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py
  45. +27 -23  tests/st/networks/models/bert/bert_performance/test_bert_thor_mlperf.py
  46. +18 -17  tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py
  47. +19 -19  tests/st/networks/models/bert/src/dataset.py
  48. +9  -10  tests/st/networks/models/resnet50/src/dataset.py
  49. +10 -11  tests/st/networks/models/resnet50/src_thor/dataset.py
  50. +10 -11  tests/st/quantization/resnet50_quant/dataset.py
  51. +63 -60  tests/ut/python/dataset/test_autocontrast.py
  52. +25 -25  tests/ut/python/dataset/test_equalize.py
  53. +23 -23  tests/ut/python/dataset/test_invert.py
  54. +4  -5   tests/ut/python/dataset/test_random_color.py
  55. +13 -14  tests/ut/python/dataset/test_random_sharpness.py
  56. +2  -3   tests/ut/python/dataset/test_random_solarize_op.py
  57. +10 -10  tests/ut/python/dataset/test_uniform_augment.py
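
Every file in this list applies the same substitution at its import site: the internal mindspore.dataset.engine module is replaced by the public mindspore.dataset package, and call sites are updated to the new module alias. A minimal before/after sketch of that pattern (the dataset directory below is a placeholder, not a path taken from this commit):

    # before: dataset classes were pulled from the internal engine module
    import mindspore.dataset.engine as de
    data_set = de.ImageFolderDataset("/path/to/imagenet/train", num_parallel_workers=8, shuffle=True)

    # after: the same classes are imported from the public mindspore.dataset package
    import mindspore.dataset as ds
    data_set = ds.ImageFolderDataset("/path/to/imagenet/train", num_parallel_workers=8, shuffle=True)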

+2 -2  model_zoo/official/cv/centerface/src/dataset.py

@@ -14,7 +14,7 @@
 # ============================================================================
 """generate dataloader and data processing entry"""
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 from src.utils import DistributedSampler
@@ -32,7 +32,7 @@ def GetDataLoader(per_batch_size,
     """
     centerface_gen = CenterfaceDataset(config=config, split=split)
     sampler = DistributedSampler(centerface_gen, rank, group_size, shuffle=(split == 'train')) # user defined sampling strategy
-    de_dataset = de.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)
+    de_dataset = ds.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)
     if group_size > 1:
         num_parallel_workers = 24

+22 -19  model_zoo/official/cv/cnn_direction_model/src/dataset.py

@@ -17,7 +17,7 @@ Data operations, will be used in train.py and eval.py
 """
 import os
 
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noise_gaussian, noise_salt_pepper, \
     shift_color, enhance_brightness, enhance_sharpness, enhance_contrast, enhance_color, gaussian_blur, \
@@ -26,6 +26,7 @@ from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noi
 
 import cv2
 import numpy as np
+
 cv2.setNumThreads(0)
 
 image_height = None
@@ -179,23 +180,24 @@ def create_dataset_train(mindrecord_file_pos, config):
     rank_id = int(os.getenv("RANK_ID", '0'))
     decode = C.Decode()
 
-    ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
-                        num_shards=rank_size, shard_id=rank_id, shuffle=True)
-    ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
+    data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
+                              num_shards=rank_size, shard_id=rank_id, shuffle=True)
+    data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
 
     augmentor = Augmentor(config.augment_severity, config.augment_prob)
     operation = augmentor.process
-    ds = ds.map(operations=operation, input_columns=["image"],
-                num_parallel_workers=1, python_multiprocessing=True)
+    data_set = data_set.map(operations=operation, input_columns=["image"],
+                            num_parallel_workers=1, python_multiprocessing=True)
     ##randomly augment half of samples to be negative samples
-    ds = ds.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], input_columns=["image", "label"],
-                num_parallel_workers=8, python_multiprocessing=True)
-    ##for training double the dataset to accoun for positive and negative
-    ds = ds.repeat(2)
+    data_set = data_set.map(operations=[random_neg_with_rotate, unify_img_label, transform_image],
+                            input_columns=["image", "label"],
+                            num_parallel_workers=8, python_multiprocessing=True)
+    ##for training double the data_set to accoun for positive and negative
+    data_set = data_set.repeat(2)
 
     # apply batch operations
-    ds = ds.batch(config.batch_size, drop_remainder=True)
-    return ds
+    data_set = data_set.batch(config.batch_size, drop_remainder=True)
+    return data_set
 
 
 def resize_image(img, label):
@@ -230,17 +232,18 @@ def create_dataset_eval(mindrecord_file_pos, config):
     rank_id = int(os.getenv("RANK_ID", '0'))
     decode = C.Decode()
 
-    ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
-                        num_shards=rank_size, shard_id=rank_id, shuffle=False)
-    ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
+    data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
+                              num_shards=rank_size, shard_id=rank_id, shuffle=False)
+    data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
 
     global image_height
     global image_width
     image_height = config.im_size_h
     image_width = config.im_size_w
-    ds = ds.map(operations=resize_image, input_columns=["image", "label"], num_parallel_workers=config.work_nums,
-                python_multiprocessing=False)
+    data_set = data_set.map(operations=resize_image, input_columns=["image", "label"],
+                            num_parallel_workers=config.work_nums,
+                            python_multiprocessing=False)
     # apply batch operations
-    ds = ds.batch(1, drop_remainder=True)
+    data_set = data_set.batch(1, drop_remainder=True)
 
-    return ds
+    return data_set
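
Note that where a file previously used ds as the name of the dataset object, as in the diff above, the commit also renames that local variable, usually to data_set, so it no longer shadows the new module alias. A minimal sketch of the resulting pattern, mirroring the calls shown above; the function name and the mindrecord_file argument here are illustrative only:

    import mindspore.dataset as ds
    import mindspore.dataset.vision.c_transforms as C

    def create_dataset(mindrecord_file, batch_size=32):
        # "ds" now refers to the mindspore.dataset module, so the pipeline
        # object is held in a differently named variable.
        data_set = ds.MindDataset(mindrecord_file, columns_list=["image", "label"],
                                  num_parallel_workers=4, shuffle=True)
        data_set = data_set.map(operations=C.Decode(), input_columns=["image"],
                                num_parallel_workers=8)
        data_set = data_set.batch(batch_size, drop_remainder=True)
        return data_set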

+6 -6  model_zoo/official/cv/crnn/src/dataset.py

@@ -16,7 +16,7 @@
 import os
 import numpy as np
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C
 import mindspore.dataset.vision.c_transforms as vc
 from PIL import Image, ImageFile
@@ -105,7 +105,7 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
         dataset = IIIT5KDataset(dataset_path, "annotation.txt", config)
     else:
         raise ValueError(f"unsupported dataset name: {name}")
-    ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
+    data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
     image_trans = [
         vc.Resize((config.image_height, config.image_width)),
         vc.Normalize([127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
@@ -114,8 +114,8 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
     label_trans = [
         C.TypeCast(mstype.int32)
     ]
-    ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
-    ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
+    data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
+    data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
 
-    ds = ds.batch(batch_size, drop_remainder=True)
-    return ds
+    data_set = data_set.batch(batch_size, drop_remainder=True)
+    return data_set

+11 -11  model_zoo/official/cv/inceptionv3/src/dataset.py

@@ -16,7 +16,7 @@
 Data operations, will be used in train.py and eval.py
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C
 from src.config import config_gpu as cfg
@@ -37,33 +37,33 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
         dataset
     """
     if group_size == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
-                                   num_shards=group_size, shard_id=rank)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
+                                         num_shards=group_size, shard_id=rank)
     # define map operations
     if do_train:
         trans = [
             C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
-            ]
+        ]
     else:
         trans = [
             C.Decode(),
            C.Resize(299),
            C.CenterCrop(299)
-            ]
+        ]
     trans += [
         C.Rescale(1.0 / 255.0, 0.0),
         C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         C.HWC2CHW()
     ]
     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
     # apply batch operations
-    ds = ds.batch(cfg.batch_size, drop_remainder=True)
+    data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
-    return ds
+    data_set = data_set.repeat(repeat_num)
+    return data_set

+17 -17  model_zoo/official/cv/mobilenetv1/src/dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -44,10 +44,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = get_group_size()
 
     if device_num == 1:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                               num_shards=device_num, shard_id=rank_id)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                     num_shards=device_num, shard_id=rank_id)
 
     # define map operations
     trans = []
@@ -66,15 +66,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
@@ -99,10 +99,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = get_group_size()
 
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)
 
     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -127,16 +127,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
 
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def _get_rank_info():


+16 -16  model_zoo/official/cv/mobilenetv2/src/dataset.py

@@ -21,7 +21,7 @@ import numpy as np
 from mindspore import Tensor
 from mindspore.train.model import Model
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 
@@ -43,22 +43,22 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):
         rank_size = int(os.getenv("RANK_SIZE", '1'))
         rank_id = int(os.getenv("RANK_ID", '0'))
         if rank_size == 1:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                       num_shards=rank_size, shard_id=rank_id)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                             num_shards=rank_size, shard_id=rank_id)
     elif config.platform == "GPU":
         if do_train:
             if config.run_distribute:
                 from mindspore.communication.management import get_rank, get_group_size
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                           num_shards=get_group_size(), shard_id=get_rank())
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                                 num_shards=get_group_size(), shard_id=get_rank())
             else:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     elif config.platform == "CPU":
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
 
     resize_height = config.image_height
     resize_width = config.image_width
@@ -83,19 +83,19 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 
     # apply shuffle operations
-    ds = ds.shuffle(buffer_size=buffer_size)
+    data_set = data_set.shuffle(buffer_size=buffer_size)
 
     # apply batch operations
-    ds = ds.batch(config.batch_size, drop_remainder=True)
+    data_set = data_set.batch(config.batch_size, drop_remainder=True)
 
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def extract_features(net, dataset_path, config):
@@ -121,5 +121,5 @@ def extract_features(net, dataset_path, config):
             features = model.predict(Tensor(image))
             np.save(features_path, features.asnumpy())
             np.save(label_path, label)
-            print(f"Complete the batch {i+1}/{step_size}")
+            print(f"Complete the batch {i + 1}/{step_size}")
     return step_size

+26 -25  model_zoo/official/cv/mobilenetv2_quant/src/dataset.py

@@ -18,7 +18,7 @@ create train or eval dataset.
 import os
 from functools import partial
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.transforms.py_transforms as P2
@@ -43,24 +43,24 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
         rank_id = int(os.getenv("RANK_ID"))
         columns_list = ['image', 'label']
        if config.data_load_mode == "mindrecord":
-            load_func = partial(de.MindDataset, dataset_path, columns_list)
+            load_func = partial(ds.MindDataset, dataset_path, columns_list)
        else:
-            load_func = partial(de.ImageFolderDataset, dataset_path)
+            load_func = partial(ds.ImageFolderDataset, dataset_path)
        if do_train:
            if rank_size == 1:
-                ds = load_func(num_parallel_workers=8, shuffle=True)
+                data_set = load_func(num_parallel_workers=8, shuffle=True)
            else:
-                ds = load_func(num_parallel_workers=8, shuffle=True,
-                               num_shards=rank_size, shard_id=rank_id)
+                data_set = load_func(num_parallel_workers=8, shuffle=True,
+                                     num_shards=rank_size, shard_id=rank_id)
        else:
-            ds = load_func(num_parallel_workers=8, shuffle=False)
+            data_set = load_func(num_parallel_workers=8, shuffle=False)
     elif device_target == "GPU":
         if do_train:
             from mindspore.communication.management import get_rank, get_group_size
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                       num_shards=get_group_size(), shard_id=get_rank())
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                             num_shards=get_group_size(), shard_id=get_rank())
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
         raise ValueError("Unsupported device_target.")
 
@@ -69,7 +69,7 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
     if do_train:
         buffer_size = 20480
         # apply shuffle operations
-        ds = ds.shuffle(buffer_size=buffer_size)
+        data_set = data_set.shuffle(buffer_size=buffer_size)
 
     # define map operations
     decode_op = C.Decode()
@@ -89,16 +89,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=16)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=16)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
 
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=1, batch_size=32):
@@ -119,12 +119,12 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
         rank_id = int(os.getenv("RANK_ID"))
         if do_train:
             if rank_size == 1:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
             else:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                           num_shards=rank_size, shard_id=rank_id)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                                 num_shards=rank_size, shard_id=rank_id)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
     else:
         raise ValueError("Unsupported device target.")
 
@@ -133,7 +133,7 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
     if do_train:
         buffer_size = 20480
         # apply shuffle operations
-        ds = ds.shuffle(buffer_size=buffer_size)
+        data_set = data_set.shuffle(buffer_size=buffer_size)
 
     # define map operations
     decode_op = P.Decode()
@@ -152,12 +152,13 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
 
     compose = P2.Compose(trans)
 
-    ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True)
+    data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8,
+                            python_multiprocessing=True)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
 
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set

+11 -11  model_zoo/official/cv/mobilenetv3/src/dataset.py

@@ -16,7 +16,7 @@
 create train or eval dataset.
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 
@@ -38,12 +38,12 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
         if do_train:
             if run_distribute:
                 from mindspore.communication.management import get_rank, get_group_size
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                           num_shards=get_group_size(), shard_id=get_rank())
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                                 num_shards=get_group_size(), shard_id=get_rank())
            else:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
        else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
         raise ValueError("Unsupported device_target.")
 
@@ -70,16 +70,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 
     # apply shuffle operations
-    ds = ds.shuffle(buffer_size=buffer_size)
+    data_set = data_set.shuffle(buffer_size=buffer_size)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
 
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set

+9 -9  model_zoo/official/cv/nasnet/src/dataset.py

@@ -16,7 +16,7 @@
 Data operations, will be used in train.py and eval.py
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C
 
@@ -37,10 +37,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
     rank = config.rank
     group_size = config.group_size
     if group_size == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
-                                   num_shards=group_size, shard_id=rank)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
+                                         num_shards=group_size, shard_id=rank)
     # define map operations
     if do_train:
         trans = [
@@ -60,10 +60,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
         C.HWC2CHW()
     ]
     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
     # apply batch operations
-    ds = ds.batch(config.batch_size, drop_remainder=True)
+    data_set = data_set.batch(config.batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
-    return ds
+    data_set = data_set.repeat(repeat_num)
+    return data_set

+26 -10  model_zoo/official/cv/psenet/src/dataset.py

@@ -25,21 +25,24 @@ import pyclipper
 from PIL import Image
 from src.config import config
 
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.py_transforms as py_transforms
 
 __all__ = ['train_dataset_creator', 'test_dataset_creator']
 
+
 def get_img(img_path):
     img = cv2.imread(img_path)
     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
     return img
 
+
 def get_imgs_names(root_dir):
     img_paths = [i for i in os.listdir(root_dir)
                  if os.path.splitext(i)[-1].lower() in ['.jpg', '.jpeg', '.png']]
     return img_paths
 
+
 def get_bboxes(img, gt_path):
     h, w = img.shape[0:2]
     with open(gt_path, 'r', encoding='utf-8-sig') as f:
@@ -58,6 +61,7 @@ def get_bboxes(img, gt_path):
         tags.append(tag)
     return np.array(bboxes), tags
 
+
 def random_scale(img, min_size):
     h, w = img.shape[0:2]
     if max(h, w) > 1280:
@@ -74,12 +78,14 @@ def random_scale(img, min_size):
         img = cv2.resize(img, dsize=None, fx=scale2, fy=scale2)
     return img
 
+
 def random_horizontal_flip(imgs):
     if random.random() < 0.5:
         for i, _ in enumerate(imgs):
             imgs[i] = np.flip(imgs[i], axis=1).copy()
     return imgs
 
+
 def random_rotate(imgs):
     max_angle = 10
     angle = random.random() * 2 * max_angle - max_angle
@@ -91,6 +97,7 @@ def random_rotate(imgs):
         imgs[i] = img_rotation
     return imgs
 
+
 def random_crop(imgs, img_size):
     h, w = imgs[0].shape[0:2]
     th, tw = img_size
@@ -118,21 +125,25 @@ def random_crop(imgs, img_size):
         imgs[idx] = imgs[idx][i:i + th, j:j + tw]
     return imgs
 
+
 def scale(img, long_size=2240):
     h, w = img.shape[0:2]
     scale_long = long_size * 1.0 / max(h, w)
     img = cv2.resize(img, dsize=None, fx=scale_long, fy=scale_long)
     return img
 
+
 def dist(a, b):
     return np.sqrt(np.sum((a - b) ** 2))
 
+
 def perimeter(bbox):
     peri = 0.0
     for i in range(bbox.shape[0]):
         peri += dist(bbox[i], bbox[(i + 1) % bbox.shape[0]])
     return peri
 
+
 def shrink(bboxes, rate, max_shr=20):
     rate = rate * rate
     shrinked_bboxes = []
@@ -158,6 +169,7 @@ def shrink(bboxes, rate, max_shr=20):
     return np.array(shrinked_bboxes)
 
+
 class TrainDataset:
     def __init__(self):
         self.is_transform = True
@@ -260,6 +272,7 @@ class TrainDataset:
     def __len__(self):
         return len(self.all_img_paths)
 
+
 def IC15_TEST_Generator():
     ic15_test_data_dir = config.TEST_ROOT_DIR + 'ch4_test_images/'
     img_size = config.INFER_LONG_SIZE
@@ -298,6 +311,7 @@ def IC15_TEST_Generator():
         yield img, img_resized, img_name
 
+
 class DistributedSampler():
     def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
         self.dataset = dataset
@@ -324,18 +338,20 @@ class DistributedSampler():
     def __len__(self):
         return self.num_samplers
 
+
 def train_dataset_creator(rank, group_size, shuffle=True):
     cv2.setNumThreads(0)
     dataset = TrainDataset()
     sampler = DistributedSampler(dataset, rank, group_size, shuffle)
-    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
-                             sampler=sampler)
-    ds = ds.repeat(1)
-    ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
-    return ds
+    data_set = ds.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
+                                   sampler=sampler)
+    data_set = data_set.repeat(1)
+    data_set = data_set.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
+    return data_set
 
+
 def test_dataset_creator():
-    ds = de.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
-    ds = ds.shuffle(config.TEST_BUFFER_SIZE)
-    ds = ds.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
-    return ds
+    data_set = ds.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
+    data_set = data_set.shuffle(config.TEST_BUFFER_SIZE)
+    data_set = data_set.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
+    return data_set

+19 -11  model_zoo/official/cv/resnet/gpu_resnet_benchmark.py

@@ -29,7 +29,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 from src.resnet_gpu_benchmark import resnet50 as resnet
 from src.CrossEntropySmooth import CrossEntropySmooth
@@ -45,19 +45,22 @@ parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dat
 parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\
                     Or the ckpt model file when eval is True')
 parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode')
-parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\
-                    help='Compute data type fp32 or fp16: default fp16')
+parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \
+                    help='Compute data type fp32 or fp16: default fp16')
 args_opt = parser.parse_args()
 
 set_seed(1)
 
+
 class MyTimeMonitor(Callback):
     def __init__(self, batch_size, sink_size):
         super(MyTimeMonitor, self).__init__()
         self.batch_size = batch_size
         self.size = sink_size
+
     def step_begin(self, run_context):
         self.step_time = time.time()
+
     def step_end(self, run_context):
         cb_params = run_context.original_args()
         loss = cb_params.net_outputs
@@ -75,17 +78,18 @@ class MyTimeMonitor(Callback):
             raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
                 cb_params.cur_epoch_num, cur_step_in_epoch))
         step_mseconds = (time.time() - self.step_time) * 1000
-        fps = self.batch_size / step_mseconds *1000 * self.size
+        fps = self.batch_size / step_mseconds * 1000 * self.size
         print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss),
               "Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True)
 
+
 def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16",
                    device_num=1):
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
-                                   num_shards=device_num, shard_id=get_rank())
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
+                                         num_shards=device_num, shard_id=get_rank())
     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
     std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
@@ -113,14 +117,15 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
     ]
     if dtype == "fp32":
         trans.append(C.HWC2CHW())
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
     if repeat_num > 1:
-        ds = ds.repeat(repeat_num)
+        data_set = data_set.repeat(repeat_num)
+
+    return data_set
 
 
-    return ds
 
 
 def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
@@ -136,6 +141,7 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per
     lr_each_step = np.array(lr_each_step).astype(np.float32)
     return lr_each_step
 
+
 def train():
     # set args
     dev = "GPU"
@@ -221,6 +227,7 @@ def train():
     else:
         model.train(epoch_size, dataset, callbacks=cb)
 
+
 def eval_():
     # set args
     dev = "GPU"
@@ -251,6 +258,7 @@ def eval_():
     res = model.eval(dataset)
     print("result:", res, "ckpt=", ckpt_dir)
 
+
 if __name__ == '__main__':
     if not args_opt.eval:
         train()


+33 -33  model_zoo/official/cv/resnet/src/dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -47,10 +47,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
     else:
         device_num = 1
     if device_num == 1:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                               num_shards=device_num, shard_id=rank_id)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                     num_shards=device_num, shard_id=rank_id)
 
     # define map operations
     trans = []
@@ -69,15 +69,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -106,10 +106,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = 1
 
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)
 
     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -134,16 +134,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
 
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -171,10 +171,10 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = 1
         rank_id = 1
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)
     image_size = 224
     mean = [0.475 * 255, 0.451 * 255, 0.392 * 255]
     std = [0.275 * 255, 0.267 * 255, 0.278 * 255]
@@ -198,15 +198,15 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
 
     type_cast_op = C2.TypeCast(mstype.int32)
 
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -234,10 +234,10 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
     else:
         device_num = 1
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
    else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)
     image_size = 224
     mean = [123.68, 116.78, 103.94]
     std = [1.0, 1.0, 1.0]
@@ -260,16 +260,16 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
     ]
 
     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=12)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
 
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
 
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)
 
-    return ds
+    return data_set
 
 
 def _get_rank_info():


+ 20
- 19
model_zoo/official/cv/resnet50_quant/src/dataset.py View File

@@ -18,7 +18,7 @@ create train or eval dataset.
import os import os
from functools import partial from functools import partial
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.py_transforms as P2 import mindspore.dataset.transforms.py_transforms as P2
@@ -53,14 +53,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="


columns_list = ['image', 'label'] columns_list = ['image', 'label']
if config.data_load_mode == "mindrecord": if config.data_load_mode == "mindrecord":
load_func = partial(de.MindDataset, dataset_path, columns_list)
load_func = partial(ds.MindDataset, dataset_path, columns_list)
else: else:
load_func = partial(de.ImageFolderDataset, dataset_path)
load_func = partial(ds.ImageFolderDataset, dataset_path)
if device_num == 1: if device_num == 1:
ds = load_func(num_parallel_workers=8, shuffle=True)
data_set = load_func(num_parallel_workers=8, shuffle=True)
else: else:
ds = load_func(num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = load_func(num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)


image_size = 224 image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -85,16 +85,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set




def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
@@ -121,12 +121,12 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe


if do_train:
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)


image_size = 224


@@ -147,12 +147,13 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe
trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op]


compose = P2.Compose(trans)
ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True)
data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8,
python_multiprocessing=True)


# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set
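The loader selection above relies on functools.partial so that only the dataset constructor differs between the mindrecord and image-folder branches. A condensed sketch of that pattern, with a placeholder path and config value:

# Sketch of the loader selection shown above (placeholder path and data_load_mode value).
from functools import partial
import mindspore.dataset as ds

def make_loader(dataset_path, data_load_mode, device_num=1, rank_id=0):
    if data_load_mode == "mindrecord":
        load_func = partial(ds.MindDataset, dataset_path, ['image', 'label'])
    else:
        load_func = partial(ds.ImageFolderDataset, dataset_path)
    if device_num == 1:
        return load_func(num_parallel_workers=8, shuffle=True)
    # Sharded read for distributed training.
    return load_func(num_parallel_workers=8, shuffle=True,
                     num_shards=device_num, shard_id=rank_id)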

+ 9
- 9
model_zoo/official/cv/resnet_thor/src/dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size from mindspore.communication.management import init, get_rank, get_group_size
@@ -47,10 +47,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
num_parallels = 4


if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True,
num_shards=device_num, shard_id=rank_id)


image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -75,16 +75,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set




def _get_rank_info(): def _get_rank_info():


+ 8
- 8
model_zoo/official/cv/shufflenetv1/src/dataset.py

@@ -15,7 +15,7 @@
"""Data operations, will be used in train.py and eval.py""" """Data operations, will be used in train.py and eval.py"""
from src.config import config from src.config import config
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.c_transforms as C


@@ -36,10 +36,10 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0):
""" """


if device_num == 1: if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
# define map operations # define map operations
if do_train: if do_train:
trans = [ trans = [
@@ -59,8 +59,8 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0):
]


type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
# apply batch operations
ds = ds.batch(config.batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(config.batch_size, drop_remainder=True)
return data_set
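A hypothetical call site for the create_dataset function above, assuming a local ImageNet-style folder; the path is a placeholder, and create_dict_iterator is the usual way to pull batches out of a MindSpore dataset object.

# Hypothetical call site for create_dataset() above; the path is a placeholder.
data_set = create_dataset("/path/to/imagenet/train", do_train=True, device_num=1, rank=0)
for batch in data_set.create_dict_iterator():
    print(batch["image"].shape, batch["label"])   # one batched sample per iteration
    break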

+ 8
- 8
model_zoo/official/cv/shufflenetv2/src/dataset.py

@@ -19,7 +19,7 @@ import numpy as np
from src.config import config_gpu as cfg


import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C


@@ -46,10 +46,10 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
dataset dataset
""" """
if group_size == 1: if group_size == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
else: else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
# define map operations # define map operations
if do_train: if do_train:
trans = [ trans = [
@@ -71,9 +71,9 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
] ]


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
# apply batch operations # apply batch operations
ds = ds.batch(cfg.batch_size, drop_remainder=True)
data_set = data_set.batch(cfg.batch_size, drop_remainder=True)


return ds
return data_set

+ 35
- 35
model_zoo/official/cv/squeezenet/src/dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size from mindspore.communication.management import init, get_rank, get_group_size
@@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path,
device_num = get_group_size() device_num = get_group_size()


if device_num == 1: if device_num == 1:
ds = de.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
data_set = ds.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
else: else:
ds = de.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
data_set = ds.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)


# define map operations # define map operations
if do_train: if do_train:
@@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path,


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
ds = ds.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
data_set = data_set.map(operations=trans,
input_columns="image",
num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set




def create_dataset_imagenet(dataset_path, def create_dataset_imagenet(dataset_path,
@@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path,
device_num = get_group_size() device_num = get_group_size()


if device_num == 1: if device_num == 1:
ds = de.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
else: else:
ds = de.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)


image_size = 227 image_size = 227
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path,


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
ds = ds.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
data_set = data_set.map(operations=trans,
input_columns="image",
num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set




def _get_rank_info():
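A minimal single-device sketch of the Cifar10 branch above, with a placeholder path and batch size; the TypeCast map mirrors the label cast shown in the hunk.

# Minimal single-device sketch of the Cifar10 pipeline (placeholder path and batch size).
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2

data_set = ds.Cifar10Dataset("/path/to/cifar10", num_parallel_workers=8, shuffle=True)
data_set = data_set.map(operations=C2.TypeCast(mstype.int32), input_columns="label",
                        num_parallel_workers=8)
data_set = data_set.batch(32, drop_remainder=True)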


+ 8
- 8
model_zoo/official/cv/warpctc/src/dataset.py

@@ -17,7 +17,7 @@ import os
import math as m
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c
import mindspore.dataset.vision.c_transforms as vc
from PIL import Image
@@ -86,7 +86,7 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_
""" """


dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target) dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target)
ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
image_trans = [ image_trans = [
vc.Rescale(1.0 / 255.0, 0.0), vc.Rescale(1.0 / 255.0, 0.0),
vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]), vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
@@ -96,12 +96,12 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_
label_trans = [ label_trans = [
c.TypeCast(mstype.int32) c.TypeCast(mstype.int32)
] ]
ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
if device_target == 'Ascend': if device_target == 'Ascend':
ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
else: else:
ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
data_set = data_set.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)


ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
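The warpctc pipeline above wraps a custom Python dataset in ds.GeneratorDataset. A toy sketch of that wrapper with a dummy random-accessible source; the shapes and labels here are made up for illustration, not the captcha data.

# Toy sketch of wrapping a Python source with GeneratorDataset (dummy data, not the captcha set).
import numpy as np
import mindspore.dataset as ds

class DummySource:
    """Random-accessible source yielding (image, label) pairs."""
    def __init__(self, length=8):
        self._len = length
    def __getitem__(self, index):
        return np.ones((64, 112, 3), dtype=np.float32), np.array([index % 10], dtype=np.int32)
    def __len__(self):
        return self._len

data_set = ds.GeneratorDataset(DummySource(), ["image", "label"], shuffle=True)
data_set = data_set.batch(4, drop_remainder=True)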

+ 9
- 8
model_zoo/official/cv/xception/src/dataset.py

@@ -16,10 +16,11 @@
Data operations, will be used in train.py and eval.py
"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C



def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
""" """
create a train or eval dataset create a train or eval dataset
@@ -35,10 +36,10 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
dataset dataset
""" """
if device_num == 1: if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
# define map operations # define map operations
if do_train: if do_train:
trans = [ trans = [
@@ -59,8 +60,8 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
] ]


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set

+ 54
- 52
model_zoo/official/nlp/bert/src/dataset.py

@@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger from mindspore import log as logger
from .config import cfg from .config import cfg
@@ -31,65 +31,67 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
for file_name in files:
if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name))
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
ori_dataset_size = data_set.get_dataset_size()
print('origin dataset size: ', ori_dataset_size) print('origin dataset size: ', ori_dataset_size)
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations # apply batch operations
ds = ds.batch(cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat count: {}".format(ds.get_repeat_count()))
return ds
data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
return data_set




def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None, do_shuffle=True): data_file_path=None, schema_file_path=None, do_shuffle=True):
"""create finetune or evaluation dataset""" """create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle)
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
shuffle=do_shuffle)
if assessment_method == "Spearman_correlation": if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32) type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else: else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set




def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None, do_shuffle=True): data_file_path=None, schema_file_path=None, do_shuffle=True):
"""create finetune or evaluation dataset""" """create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle)
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
shuffle=do_shuffle)
if assessment_method == "Spearman_correlation": if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32) type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else: else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set




def generator_squad(data_features): def generator_squad(data_features):
@@ -102,20 +104,20 @@ def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, sche
"""create finetune or evaluation dataset""" """create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
if is_training: if is_training:
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "start_positions",
"end_positions", "unique_ids", "is_impossible"],
shuffle=do_shuffle)
ds = ds.map(operations=type_cast_op, input_columns="start_positions")
ds = ds.map(operations=type_cast_op, input_columns="end_positions")
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "start_positions",
"end_positions", "unique_ids", "is_impossible"],
shuffle=do_shuffle)
data_set = data_set.map(operations=type_cast_op, input_columns="start_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="end_positions")
else: else:
ds = de.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle,
column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"])
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.map(operations=type_cast_op, input_columns="unique_ids")
ds = ds.repeat(repeat_count)
data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle,
column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"])
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="unique_ids")
data_set = data_set.repeat(repeat_count)
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
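A condensed sketch of the pretraining pipeline defined above, assuming a single placeholder TFRecord file and no schema; the column list and the ds.Shuffle.FILES option are taken from the hunk, everything else is illustrative.

# Condensed sketch of the BERT pretraining pipeline (file list and batch size are placeholders).
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C

data_files = ["/path/to/part0.tfrecord"]
data_set = ds.TFRecordDataset(data_files, None,
                              columns_list=["input_ids", "input_mask", "segment_ids",
                                            "next_sentence_labels", "masked_lm_positions",
                                            "masked_lm_ids", "masked_lm_weights"],
                              shuffle=ds.Shuffle.FILES)
type_cast_op = C.TypeCast(mstype.int32)
# Cast every integer column, mirroring the sequence of map() calls above.
for column in ("masked_lm_ids", "masked_lm_positions", "next_sentence_labels",
               "segment_ids", "input_mask", "input_ids"):
    data_set = data_set.map(operations=type_cast_op, input_columns=column)
data_set = data_set.batch(32, drop_remainder=True)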

+ 57
- 57
model_zoo/official/nlp/bert_thor/src/dataset.py

@@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger from mindspore import log as logger
from .bert_net_config import bert_net_cfg from .bert_net_config import bert_net_cfg
@@ -32,96 +32,96 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
if "tfrecord" in file_name: if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name)) data_files.append(os.path.join(data_dir, file_name))
data_files = sorted(data_files) data_files = sorted(data_files)
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=False)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=False)
ori_dataset_size = data_set.get_dataset_size()
print('origin dataset size: ', ori_dataset_size) print('origin dataset size: ', ori_dataset_size)
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations # apply batch operations
ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat count: {}".format(ds.get_repeat_count()))
return ds
data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
return data_set




def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None): data_file_path=None, schema_file_path=None):
"""create finetune or evaluation dataset""" """create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
if assessment_method == "Spearman_correlation": if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32) type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else: else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply shuffle operation # apply shuffle operation
buffer_size = 960 buffer_size = 960
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set




def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None): data_file_path=None, schema_file_path=None):
"""create finetune or evaluation dataset""" """create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
if assessment_method == "Spearman_correlation": if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32) type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else: else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply shuffle operation # apply shuffle operation
buffer_size = 960 buffer_size = 960
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set




def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True): def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True):
"""create finetune or evaluation dataset""" """create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
if is_training: if is_training:
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids",
"start_positions", "end_positions",
"unique_ids", "is_impossible"])
ds = ds.map(operations=type_cast_op, input_columns="start_positions")
ds = ds.map(operations=type_cast_op, input_columns="end_positions")
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids",
"start_positions", "end_positions",
"unique_ids", "is_impossible"])
data_set = data_set.map(operations=type_cast_op, input_columns="start_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="end_positions")
else: else:
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply shuffle operation # apply shuffle operation
buffer_size = 960 buffer_size = 960
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set

+ 7
- 7
model_zoo/official/nlp/fasttext/eval.py

@@ -22,7 +22,7 @@ import mindspore.ops.operations as P
from mindspore.common.tensor import Tensor
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from src.fasttext_model import FastText
@@ -73,15 +73,15 @@ class FastTextInferCell(nn.Cell):
def load_infer_dataset(batch_size, datafile):
"""data loader for infer"""
ds = de.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])
data_set = ds.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])
type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
ds = ds.map(operations=type_cast_op, input_columns="label_idx")
ds = ds.batch(batch_size=batch_size, drop_remainder=True)
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens")
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length")
data_set = data_set.map(operations=type_cast_op, input_columns="label_idx")
data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
return ds
return data_set
def run_fasttext_infer():
"""run infer with FastText"""


+ 3
- 3
model_zoo/official/nlp/fasttext/src/dataset.py

@@ -25,8 +25,10 @@ import spacy
from sklearn.feature_extraction import FeatureHasher
from mindspore.mindrecord import FileWriter
class FastTextDataPreProcess():
"""FastText data preprocess"""
def __init__(self, train_path,
test_file,
max_length,
@@ -194,7 +196,6 @@ class FastTextDataPreProcess():
if self.text_less in sent_describe and self.text_greater in sent_describe:
sent_describe = self.str_html.sub('', sent_describe)
doc = spacy_nlp(sent_describe)
bows_token = [token.text for token in doc]
@@ -222,7 +223,7 @@ class FastTextDataPreProcess():
def _get_bucket_length(self, x, bts):
x_len = len(x)
for index in range(1, len(bts)):
if bts[index-1] < x_len <= bts[index]:
if bts[index - 1] < x_len <= bts[index]:
return bts[index]
return bts[0]
@@ -310,7 +311,6 @@ if __name__ == '__main__':
print("Writing test data to MindRecord file.....") print("Writing test data to MindRecord file.....")
for k in args.test_bucket: for k in args.test_bucket:
write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1) write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1)
print("All done.....") print("All done.....")

+ 25
- 22
model_zoo/official/nlp/fasttext/src/load_dataset.py

@@ -14,9 +14,10 @@
# ============================================================================
"""FastText data loader"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
def load_dataset(dataset_path,
batch_size,
epoch_count=1,
@@ -25,38 +26,40 @@ def load_dataset(dataset_path,
bucket=None, bucket=None,
shuffle=True): shuffle=True):
"""dataset loader""" """dataset loader"""
def batch_per_bucket(bucket_length, input_file): def batch_per_bucket(bucket_length, input_file):
input_file = input_file +'/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
input_file = input_file + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
if not input_file: if not input_file:
raise FileNotFoundError("input file parameter must not be empty.") raise FileNotFoundError("input file parameter must not be empty.")
ds = de.MindDataset(input_file,
columns_list=['src_tokens', 'src_tokens_length', 'label_idx'],
shuffle=shuffle,
num_shards=rank_size,
shard_id=rank_id,
num_parallel_workers=8)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.MindDataset(input_file,
columns_list=['src_tokens', 'src_tokens_length', 'label_idx'],
shuffle=shuffle,
num_shards=rank_size,
shard_id=rank_id,
num_parallel_workers=8)
ori_dataset_size = data_set.get_dataset_size()
print(f"Dataset size: {ori_dataset_size}") print(f"Dataset size: {ori_dataset_size}")
repeat_count = epoch_count repeat_count = epoch_count
type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
ds = ds.map(operations=type_cast_op, input_columns="label_idx")
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens")
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length")
data_set = data_set.map(operations=type_cast_op, input_columns="label_idx")
data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
data_set = data_set.batch(batch_size, drop_remainder=False)
data_set = data_set.repeat(repeat_count)
return data_set
ds = ds.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
ds = ds.batch(batch_size, drop_remainder=False)
ds = ds.repeat(repeat_count)
return ds
for i, _ in enumerate(bucket):
bucket_len = bucket[i]
ds_per = batch_per_bucket(bucket_len, dataset_path)
if i == 0:
ds = ds_per
data_set = ds_per
else: else:
ds = ds + ds_per
ds = ds.shuffle(ds.get_dataset_size())
ds.channel_name = 'fasttext'
data_set = data_set + ds_per
data_set = data_set.shuffle(data_set.get_dataset_size())
data_set.channel_name = 'fasttext'
return ds
return data_set
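The loader above builds one MindDataset per bucket and concatenates the buckets with the + operator before a final shuffle. A compact sketch of that scheme with placeholder bucket sizes and directory:

# Compact sketch of the per-bucket concatenation above (placeholder directory and bucket sizes).
import mindspore.dataset as ds

def load_buckets(dataset_dir, buckets=(16, 32, 64), batch_size=128):
    data_set = None
    for bucket_length in buckets:
        mindrecord = dataset_dir + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
        per_bucket = ds.MindDataset(mindrecord,
                                    columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])
        per_bucket = per_bucket.batch(batch_size, drop_remainder=False)
        # The + operator concatenates two MindSpore datasets, as in the loop above.
        data_set = per_bucket if data_set is None else data_set + per_bucket
    return data_set.shuffle(data_set.get_dataset_size())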

+ 19
- 19
model_zoo/official/nlp/gnmt_v2/src/dataset/load_dataset.py

@@ -15,7 +15,7 @@
"""Dataset loader to feed into model.""" """Dataset loader to feed into model."""
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC import mindspore.dataset.transforms.c_transforms as deC




@@ -55,7 +55,7 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
print(f" | Loading {datafile}.") print(f" | Loading {datafile}.")


if not is_translate: if not is_translate:
ds = de.MindDataset(
data_set = ds.MindDataset(
input_files, columns_list=[ input_files, columns_list=[
"src", "src_padding", "src", "src_padding",
"prev_opt", "prev_opt",
@@ -64,18 +64,18 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
num_parallel_workers=8 num_parallel_workers=8
) )


ori_dataset_size = ds.get_dataset_size()
ori_dataset_size = data_set.get_dataset_size()
print(f" | Dataset size: {ori_dataset_size}.") print(f" | Dataset size: {ori_dataset_size}.")
if shuffle: if shuffle:
ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20)
type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8)


ds = ds.rename(
data_set = data_set.rename(
input_columns=["src", input_columns=["src",
"src_padding", "src_padding",
"prev_opt", "prev_opt",
@@ -87,9 +87,9 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
"target_eos_ids", "target_eos_ids",
"target_eos_mask"] "target_eos_mask"]
) )
ds = ds.batch(batch_size, drop_remainder=drop_remainder)
data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)
else: else:
ds = de.MindDataset(
data_set = ds.MindDataset(
input_files, columns_list=[ input_files, columns_list=[
"src", "src_padding" "src", "src_padding"
], ],
@@ -97,23 +97,23 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
num_parallel_workers=8 num_parallel_workers=8
) )


ori_dataset_size = ds.get_dataset_size()
ori_dataset_size = data_set.get_dataset_size()
print(f" | Dataset size: {ori_dataset_size}.") print(f" | Dataset size: {ori_dataset_size}.")
if shuffle: if shuffle:
ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20)
type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)


ds = ds.rename(
data_set = data_set.rename(
input_columns=["src", input_columns=["src",
"src_padding"], "src_padding"],
output_columns=["source_eos_ids", output_columns=["source_eos_ids",
"source_eos_mask"] "source_eos_mask"]
) )
ds = ds.batch(batch_size, drop_remainder=drop_remainder)
data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)


return ds
return data_set




def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool, def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool,


+ 15
- 15
model_zoo/official/nlp/mass/src/dataset/load_dataset.py

@@ -14,7 +14,7 @@
# ============================================================================
"""Dataset loader to feed into model."""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC




@@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
for datafile in input_files: for datafile in input_files:
print(f" | Loading {datafile}.") print(f" | Loading {datafile}.")


ds = de.TFRecordDataset(
data_set = ds.TFRecordDataset(
input_files, input_files,
columns_list=[ columns_list=[
"src", "src_padding", "src", "src_padding",
@@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True, num_parallel_workers=8) shard_equal_rows=True, num_parallel_workers=8)


ori_dataset_size = ds.get_dataset_size()
ori_dataset_size = data_set.get_dataset_size()
print(f" | Dataset size: {ori_dataset_size}.") print(f" | Dataset size: {ori_dataset_size}.")
repeat_count = epoch_count repeat_count = epoch_count


type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="src")
ds = ds.map(operations=type_cast_op, input_columns="src_padding")
ds = ds.map(operations=type_cast_op, input_columns="prev_opt")
ds = ds.map(operations=type_cast_op, input_columns="prev_padding")
ds = ds.map(operations=type_cast_op, input_columns="target")
ds = ds.map(operations=type_cast_op, input_columns="tgt_padding")
ds = ds.rename(
data_set = data_set.map(operations=type_cast_op, input_columns="src")
data_set = data_set.map(operations=type_cast_op, input_columns="src_padding")
data_set = data_set.map(operations=type_cast_op, input_columns="prev_opt")
data_set = data_set.map(operations=type_cast_op, input_columns="prev_padding")
data_set = data_set.map(operations=type_cast_op, input_columns="target")
data_set = data_set.map(operations=type_cast_op, input_columns="tgt_padding")
data_set = data_set.rename(
input_columns=["src", input_columns=["src",
"src_padding", "src_padding",
"prev_opt", "prev_opt",
@@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
"target_eos_mask"] "target_eos_mask"]
) )


ds = ds.batch(batch_size, drop_remainder=True)
ds = ds.repeat(repeat_count)
data_set = data_set.batch(batch_size, drop_remainder=True)
data_set = data_set.repeat(repeat_count)


ds.channel_name = 'transformer'
return ds
data_set.channel_name = 'transformer'
return data_set




def load_dataset(data_files: list, batch_size: int, epoch_count: int, def load_dataset(data_files: list, batch_size: int, epoch_count: int,


+ 15
- 15
model_zoo/official/nlp/prophetnet/src/dataset/load_dataset.py

@@ -14,7 +14,7 @@
# ============================================================================
"""Dataset loader to feed into model."""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC




@@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
for datafile in input_files: for datafile in input_files:
print(f" | Loading {datafile}.") print(f" | Loading {datafile}.")


ds = de.TFRecordDataset(
data_set = ds.TFRecordDataset(
input_files, input_files,
columns_list=[ columns_list=[
"src", "src_padding", "src", "src_padding",
@@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True, num_parallel_workers=8) shard_equal_rows=True, num_parallel_workers=8)


ori_dataset_size = ds.get_dataset_size()
ori_dataset_size = data_set.get_dataset_size()
print(f" | Dataset size: {ori_dataset_size}.") print(f" | Dataset size: {ori_dataset_size}.")
repeat_count = epoch_count repeat_count = epoch_count


type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(input_columns="src", operations=type_cast_op)
ds = ds.map(input_columns="src_padding", operations=type_cast_op)
ds = ds.map(input_columns="prev_opt", operations=type_cast_op)
ds = ds.map(input_columns="prev_padding", operations=type_cast_op)
ds = ds.map(input_columns="target", operations=type_cast_op)
ds = ds.map(input_columns="tgt_padding", operations=type_cast_op)
ds = ds.rename(
data_set = data_set.map(input_columns="src", operations=type_cast_op)
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op)
data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op)
data_set = data_set.map(input_columns="prev_padding", operations=type_cast_op)
data_set = data_set.map(input_columns="target", operations=type_cast_op)
data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op)
data_set = data_set.rename(
input_columns=["src", input_columns=["src",
"src_padding", "src_padding",
"prev_opt", "prev_opt",
@@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
"target_eos_mask"] "target_eos_mask"]
) )


ds = ds.batch(batch_size, drop_remainder=True)
ds = ds.repeat(repeat_count)
data_set = data_set.batch(batch_size, drop_remainder=True)
data_set = data_set.repeat(repeat_count)


ds.channel_name = 'transformer'
return ds
data_set.channel_name = 'transformer'
return data_set




def load_dataset(data_files: list, batch_size: int, epoch_count: int, def load_dataset(data_files: list, batch_size: int, epoch_count: int,


+ 15
- 13
model_zoo/official/nlp/tinybert/src/dataset.py

@@ -18,14 +18,16 @@
import os
from enum import Enum
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C



class DataType(Enum):
"""Enumerate supported dataset format"""
TFRECORD = 1
MINDRECORD = 2



def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0,
do_shuffle="true", data_dir=None, schema_dir=None, do_shuffle="true", data_dir=None, schema_dir=None,
data_type=DataType.TFRECORD): data_type=DataType.TFRECORD):
@@ -47,22 +49,22 @@ def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0,
shuffle = False shuffle = False


if data_type == DataType.MINDRECORD: if data_type == DataType.MINDRECORD:
ds = de.MindDataset(data_files, columns_list=columns_list,
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank)
data_set = ds.MindDataset(data_files, columns_list=columns_list,
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank)
else: else:
ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
shuffle=shuffle, num_shards=device_num, shard_id=rank,
shard_equal_rows=shard_equal_rows)
data_set = ds.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
shuffle=shuffle, num_shards=device_num, shard_id=rank,
shard_equal_rows=shard_equal_rows)
if device_num == 1 and shuffle is True: if device_num == 1 and shuffle is True:
ds = ds.shuffle(10000)
data_set = data_set.shuffle(10000)


type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
if task == "td": if task == "td":
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


return ds
return data_set
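A short sketch of the format switch above, assuming placeholder input files, a single device and no schema; only the source constructor differs between the two branches.

# Sketch of the MINDRECORD/TFRECORD switch (placeholder files, single device, no schema).
from enum import Enum
import mindspore.dataset as ds

class DataType(Enum):
    TFRECORD = 1
    MINDRECORD = 2

def open_source(data_files, schema_dir=None, data_type=DataType.TFRECORD):
    if data_type == DataType.MINDRECORD:
        return ds.MindDataset(data_files, columns_list=["input_ids", "input_mask", "segment_ids"])
    return ds.TFRecordDataset(data_files, schema_dir,
                              columns_list=["input_ids", "input_mask", "segment_ids"])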

+ 21
- 15
model_zoo/official/nlp/transformer/eval.py

@@ -23,38 +23,41 @@ from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context


from src.transformer_model import TransformerModel
from src.eval_config import cfg, transformer_net_cfg



def load_test_data(batch_size=1, data_file=None): def load_test_data(batch_size=1, data_file=None):
""" """
Load test dataset Load test dataset
""" """
ds = de.MindDataset(data_file,
columns_list=["source_eos_ids", "source_eos_mask",
"target_sos_ids", "target_sos_mask",
"target_eos_ids", "target_eos_mask"],
shuffle=False)
data_set = ds.MindDataset(data_file,
columns_list=["source_eos_ids", "source_eos_mask",
"target_sos_ids", "target_sos_mask",
"target_eos_ids", "target_eos_mask"],
shuffle=False)
type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask")
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
ds.channel_name = 'transformer'
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
data_set.channel_name = 'transformer'
return data_set



class TransformerInferCell(nn.Cell): class TransformerInferCell(nn.Cell):
""" """
Encapsulation class of transformer network infer. Encapsulation class of transformer network infer.
""" """

def __init__(self, network): def __init__(self, network):
super(TransformerInferCell, self).__init__(auto_prefix=False) super(TransformerInferCell, self).__init__(auto_prefix=False)
self.network = network self.network = network
@@ -65,6 +68,7 @@ class TransformerInferCell(nn.Cell):
predicted_ids = self.network(source_ids, source_mask) predicted_ids = self.network(source_ids, source_mask)
return predicted_ids return predicted_ids



def load_weights(model_path): def load_weights(model_path):
""" """
Load checkpoint as parameter dict, support both npz file and mindspore checkpoint file. Load checkpoint as parameter dict, support both npz file and mindspore checkpoint file.
@@ -93,6 +97,7 @@ def load_weights(model_path):
parameter_dict[name] = Parameter(Tensor(weights[name]), name=name) parameter_dict[name] = Parameter(Tensor(weights[name]), name=name)
return parameter_dict return parameter_dict



def run_transformer_eval(): def run_transformer_eval():
""" """
Transformer evaluation. Transformer evaluation.
@@ -136,5 +141,6 @@ def run_transformer_eval():
f.write(" ".join(token_ids) + "\n") f.write(" ".join(token_ids) + "\n")
f.close() f.close()



if __name__ == "__main__": if __name__ == "__main__":
run_transformer_eval() run_transformer_eval()
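
Note: load_test_data above applies the same TypeCast to six MindRecord columns, one map call per column; only the ds alias and the data_set name change. A small sketch of that per-column cast written as a loop, using toy in-memory columns (two of the six) instead of the MindRecord file:

import numpy as np
import mindspore.dataset as ds
import mindspore.common.dtype as mstype
import mindspore.dataset.transforms.c_transforms as deC

columns = ["source_eos_ids", "source_eos_mask"]  # subset of the six columns
data = {name: np.ones((4, 3), dtype=np.int64) for name in columns}
data_set = ds.NumpySlicesDataset(data, shuffle=False)
type_cast_op = deC.TypeCast(mstype.int32)
for name in columns:  # one cast per column, as in load_test_data
    data_set = data_set.map(operations=type_cast_op, input_columns=name)
data_set = data_set.batch(1, drop_remainder=True)
data_set.channel_name = 'transformer'  # plain Python attribute, untouched by the rename
print(data_set.get_dataset_size())  # 4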

+36    -36      model_zoo/official/recommend/deepfm/src/dataset.py

@@ -21,7 +21,7 @@ from enum import Enum


import numpy as np import numpy as np
import pandas as pd import pandas as pd
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype


from .config import DataConfig from .config import DataConfig
@@ -142,8 +142,8 @@ class H5Dataset():
X_id = X[:, 0:self.max_length] X_id = X[:, 0:self.max_length]
X_va = X[:, self.max_length:] X_va = X[:, self.max_length:]
yield np.array(X_id.astype(dtype=np.int32)), \ yield np.array(X_id.astype(dtype=np.int32)), \
np.array(X_va.astype(dtype=np.float32)), \
np.array(y.astype(dtype=np.float32))
np.array(X_va.astype(dtype=np.float32)), \
np.array(y.astype(dtype=np.float32))




def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
@@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
for _ in range(0, numbers_of_batch, 1): for _ in range(0, numbers_of_batch, 1):
yield train_eval_gen.__next__() yield train_eval_gen.__next__()


ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
ds = ds.repeat(epochs)
return ds
data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
data_set = data_set.repeat(epochs)
return data_set




def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
@@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
shuffle = train_mode shuffle = train_mode


if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
else: else:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
@@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
for filename in filenames: for filename in filenames:
if file_prefixt_name in filename and 'tfrecord' in filename: if file_prefixt_name in filename and 'tfrecord' in filename:
dataset_files.append(os.path.join(dir_path, filename)) dataset_files.append(os.path.join(dir_path, filename))
schema = de.Schema()
schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32) schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32) schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32) schema.add_column('label', de_type=mstype.float32)
if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True)
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True)
else: else:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (
np.array(x).flatten().reshape(batch_size, 39), np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39), np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))), np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
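
Note: _get_h5_dataset above wraps a Python generator in GeneratorDataset and folds the epochs into repeat(); the rename does not change that. A runnable sketch with a dummy generator (shapes and values invented) in place of the H5-backed one:

import numpy as np
import mindspore.dataset as ds

def _iter_h5_data():
    # stand-in for the H5-backed generator: (ids, weights, labels) rows
    for _ in range(4):
        yield (np.zeros(3, np.int32), np.ones(3, np.float32),
               np.array([1.0], np.float32))

data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
data_set = data_set.repeat(2)  # epochs folded into the pipeline, as above
row = next(data_set.create_dict_iterator(output_numpy=True))
print(row["ids"].shape, row["labels"])  # (3,) [1.]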


+46    -48      model_zoo/official/recommend/wide_and_deep/src/datasets.py

@@ -14,13 +14,12 @@
# ============================================================================ # ============================================================================
"""train_dataset.""" """train_dataset."""



import os import os
import math import math
from enum import Enum from enum import Enum
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype




@@ -84,9 +83,9 @@ class H5Dataset():
yield os.path.join(self._hdf_data_dir, yield os.path.join(self._hdf_data_dir,
self._file_prefix + '_input_part_' + str( self._file_prefix + '_input_part_' + str(
p) + '.h5'), \ p) + '.h5'), \
os.path.join(self._hdf_data_dir,
self._file_prefix + '_output_part_' + str(
p) + '.h5'), i + 1 == len(parts)
os.path.join(self._hdf_data_dir,
self._file_prefix + '_output_part_' + str(
p) + '.h5'), i + 1 == len(parts)


def _generator(self, X, y, batch_size, shuffle=True): def _generator(self, X, y, batch_size, shuffle=True):
""" """
@@ -106,8 +105,7 @@ class H5Dataset():
np.random.shuffle(sample_index) np.random.shuffle(sample_index)
assert X.shape[0] > 0 assert X.shape[0] > 0
while True: while True:
batch_index = sample_index[
batch_size * counter: batch_size * (counter + 1)]
batch_index = sample_index[batch_size * counter: batch_size * (counter + 1)]
X_batch = X[batch_index] X_batch = X[batch_index]
y_batch = y[batch_index] y_batch = y[batch_index]
counter += 1 counter += 1
@@ -140,9 +138,8 @@ class H5Dataset():
X, y, finished = data_gen.__next__() X, y, finished = data_gen.__next__()
X_id = X[:, 0:self.input_length] X_id = X[:, 0:self.input_length]
X_va = X[:, self.input_length:] X_va = X[:, self.input_length:]
yield np.array(X_id.astype(dtype=np.int32)), np.array(
X_va.astype(dtype=np.float32)), np.array(
y.astype(dtype=np.float32))
yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array(
y.astype(dtype=np.float32))




def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
@@ -164,9 +161,9 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
for _ in range(0, numbers_of_batch, 1): for _ in range(0, numbers_of_batch, 1):
yield train_eval_gen.__next__() yield train_eval_gen.__next__()


ds = de.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"])
ds = ds.repeat(epochs)
return ds
data_set = ds.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"])
data_set = data_set.repeat(epochs)
return data_set




def _padding_func(batch_size, manual_shape, target_column, field_size=39): def _padding_func(batch_size, manual_shape, target_column, field_size=39):
@@ -174,11 +171,11 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39):
get padding_func get padding_func
""" """
if manual_shape: if manual_shape:
generate_concat_offset = [item[0]+item[1] for item in manual_shape]
generate_concat_offset = [item[0] + item[1] for item in manual_shape]
part_size = int(target_column / len(generate_concat_offset)) part_size = int(target_column / len(generate_concat_offset))
filled_value = [] filled_value = []
for i in range(field_size, target_column): for i in range(field_size, target_column):
filled_value.append(generate_concat_offset[i//part_size]-1)
filled_value.append(generate_concat_offset[i // part_size] - 1)
print("Filed Value:", filled_value) print("Filed Value:", filled_value)


def padding_func(x, y, z): def padding_func(x, y, z):
@@ -190,7 +187,7 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39):
dtype=np.int32) * filled_value dtype=np.int32) * filled_value
x_id = np.concatenate([x, x_id.astype(dtype=np.int32)], axis=1) x_id = np.concatenate([x, x_id.astype(dtype=np.int32)], axis=1)
mask = np.concatenate( mask = np.concatenate(
[y, np.zeros((batch_size, target_column-39), dtype=np.float32)], axis=1)
[y, np.zeros((batch_size, target_column - 39), dtype=np.float32)], axis=1)
return (x_id, mask, z) return (x_id, mask, z)
else: else:
def padding_func(x, y, z): def padding_func(x, y, z):
@@ -214,24 +211,25 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
for filename in filenames: for filename in filenames:
if file_prefix_name in filename and "tfrecord" in filename: if file_prefix_name in filename and "tfrecord" in filename:
dataset_files.append(os.path.join(dirpath, filename)) dataset_files.append(os.path.join(dirpath, filename))
schema = de.Schema()
schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32) schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32) schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32) schema.add_column('label', de_type=mstype.float32)
if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
else: else:
ds = de.TFRecordDataset(dataset_files=dataset_files,
shuffle=shuffle, schema=schema, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample),
drop_remainder=True)
data_set = ds.TFRecordDataset(dataset_files=dataset_files,
shuffle=shuffle, schema=schema, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample),
drop_remainder=True)


ds = ds.map(operations=_padding_func(batch_size, manual_shape, target_column),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
data_set = data_set.map(operations=_padding_func(batch_size, manual_shape, target_column),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
@@ -257,21 +255,21 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
shuffle = train_mode shuffle = train_mode


if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
else: else:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(_padding_func(batch_size, manual_shape, target_column),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(_padding_func(batch_size, manual_shape, target_column),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multiply=False, per_vocab_size=None): def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multiply=False, per_vocab_size=None):
@@ -284,7 +282,7 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl
5, 21762, 14, 15, 15030, 61, 12220] 5, 21762, 14, 15, 15030, 61, 12220]


new_vocabs = inidival_vocabs + [1] * \ new_vocabs = inidival_vocabs + [1] * \
(target_column_number - len(inidival_vocabs))
(target_column_number - len(inidival_vocabs))
part_size = int(target_column_number / worker_size) part_size = int(target_column_number / worker_size)


# According to the workers, we merge some fields into the same part # According to the workers, we merge some fields into the same part
@@ -304,21 +302,21 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl
# Expands the vocabulary of each field by the multiplier # Expands the vocabulary of each field by the multiplier
if multiply is True: if multiply is True:
cur_sum = sum(new_vocab_size) cur_sum = sum(new_vocab_size)
k = total_vocab_size/cur_sum
k = total_vocab_size / cur_sum
new_vocab_size = [ new_vocab_size = [
math.ceil(int(item*k)/worker_size)*worker_size for item in new_vocab_size]
new_vocab_size = [(item // 8 + 1)*8 for item in new_vocab_size]
math.ceil(int(item * k) / worker_size) * worker_size for item in new_vocab_size]
new_vocab_size = [(item // 8 + 1) * 8 for item in new_vocab_size]


else: else:
if total_vocab_size > sum(new_vocab_size): if total_vocab_size > sum(new_vocab_size):
new_vocab_size[-1] = total_vocab_size - \ new_vocab_size[-1] = total_vocab_size - \
sum(new_vocab_size[:-1])
sum(new_vocab_size[:-1])
new_vocab_size = [item for item in new_vocab_size] new_vocab_size = [item for item in new_vocab_size]
else: else:
raise ValueError( raise ValueError(
"Please providede the correct vocab size, now is {}".format(total_vocab_size)) "Please providede the correct vocab size, now is {}".format(total_vocab_size))


for i in range(worker_size-1):
for i in range(worker_size - 1):
off = index_offsets[i] + features[i] off = index_offsets[i] + features[i]
index_offsets.append(off) index_offsets.append(off)
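
Note: the Schema that _get_tf_dataset feeds into TFRecordDataset now comes from ds.Schema() instead of de.Schema(); it can be built and inspected on its own, without the TFRecord files. A sketch of just that piece:

import mindspore.dataset as ds
import mindspore.common.dtype as mstype

schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32)
print(schema.to_json())  # column names and types as JSON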




+2     -2       model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py

@@ -17,7 +17,7 @@


import os import os
import sys import sys
import mindspore.dataset.engine as de
import mindspore.dataset as ds
from mindspore import Model, context from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode from mindspore.context import ParallelMode
@@ -88,7 +88,7 @@ def train_and_eval(config):
print("epochs is {}".format(epochs)) print("epochs is {}".format(epochs))
if config.full_batch: if config.full_batch:
context.set_auto_parallel_context(full_batch=True) context.set_auto_parallel_context(full_batch=True)
de.config.set_seed(1)
ds.config.set_seed(1)
if config.field_slice: if config.field_slice:
compute_manual_shape(config, get_group_size()) compute_manual_shape(config, get_group_size())
ds_train = create_dataset(data_path, train_mode=True, epochs=1, ds_train = create_dataset(data_path, train_mode=True, epochs=1,
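
Note: the functional change in this file is only the alias used to reach the global dataset config. A two-line check that the new path hits the same module:

import mindspore.dataset as ds

ds.config.set_seed(1)
print(ds.config.get_seed())  # 1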


+2     -2       model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py

@@ -17,7 +17,7 @@


import os import os
import sys import sys
import mindspore.dataset.engine as de
import mindspore.dataset as ds
from mindspore import Model, context from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode from mindspore.context import ParallelMode
@@ -92,7 +92,7 @@ def train_and_eval(config):
print("epochs is {}".format(epochs)) print("epochs is {}".format(epochs))
if config.full_batch: if config.full_batch:
context.set_auto_parallel_context(full_batch=True) context.set_auto_parallel_context(full_batch=True)
de.config.set_seed(1)
ds.config.set_seed(1)
ds_train = create_dataset(data_path, train_mode=True, epochs=1, ds_train = create_dataset(data_path, train_mode=True, epochs=1,
batch_size=batch_size*get_group_size(), data_type=dataset_type) batch_size=batch_size*get_group_size(), data_type=dataset_type)
ds_eval = create_dataset(data_path, train_mode=False, epochs=1, ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
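
Note: a short sketch of what set_seed(1) is for here: with the same global seed, a shuffled pipeline is expected to yield rows in the same order across runs (toy data below; the behaviour is stated as expected, not measured):

import numpy as np
import mindspore.dataset as ds

def first_row(seed):
    ds.config.set_seed(seed)
    data = ds.NumpySlicesDataset(np.arange(10, dtype=np.int32),
                                 column_names=["n"], shuffle=True)
    return next(data.create_dict_iterator(output_numpy=True))["n"]

print(first_row(1), first_row(1))  # expected to match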


+24    -26      model_zoo/official/recommend/wide_and_deep_multitable/src/datasets.py

@@ -18,7 +18,7 @@ import math
import pickle import pickle
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype




@@ -97,8 +97,7 @@ class H5Dataset():
np.random.shuffle(sample_index) np.random.shuffle(sample_index)
assert X.shape[0] > 0 assert X.shape[0] > 0
while True: while True:
batch_index = sample_index[batch_size * counter:batch_size *
(counter + 1)]
batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
X_batch = X[batch_index] X_batch = X[batch_index]
y_batch = y[batch_index] y_batch = y[batch_index]
counter += 1 counter += 1
@@ -135,9 +134,8 @@ class H5Dataset():
X, y, finished = data_gen.__next__() X, y, finished = data_gen.__next__()
X_id = X[:, 0:self.input_length] X_id = X[:, 0:self.input_length]
X_va = X[:, self.input_length:] X_va = X[:, self.input_length:]
yield np.array(X_id.astype(dtype=np.int32)), np.array(
X_va.astype(dtype=np.float32)), np.array(
y.astype(dtype=np.float32))
yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array(
y.astype(dtype=np.float32))




def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
@@ -159,10 +157,10 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
for _ in range(0, numbers_of_batch, 1): for _ in range(0, numbers_of_batch, 1):
yield train_eval_gen.__next__() yield train_eval_gen.__next__()


ds = de.GeneratorDataset(_iter_h5_data(),
["ids", "weights", "labels"])
ds = ds.repeat(epochs)
return ds
data_set = ds.GeneratorDataset(_iter_h5_data(),
["ids", "weights", "labels"])
data_set = data_set.repeat(epochs)
return data_set




def _get_tf_dataset(data_dir, def _get_tf_dataset(data_dir,
@@ -184,7 +182,7 @@ def _get_tf_dataset(data_dir,
for filename in filenames: for filename in filenames:
if file_prefix_name in filename and "tfrecord" in filename: if file_prefix_name in filename and "tfrecord" in filename:
dataset_files.append(os.path.join(dirpath, filename)) dataset_files.append(os.path.join(dirpath, filename))
schema = de.Schema()
schema = ds.Schema()


float_key_list = ["label", "continue_val"] float_key_list = ["label", "continue_val"]


@@ -199,19 +197,19 @@ def _get_tf_dataset(data_dir,
schema.add_column(key, de_type=ms_dtype) schema.add_column(key, de_type=ms_dtype)


if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.TFRecordDataset(dataset_files=dataset_files,
shuffle=shuffle,
schema=schema,
num_parallel_workers=8,
num_shards=rank_size,
shard_id=rank_id,
shard_equal_rows=True)
data_set = ds.TFRecordDataset(dataset_files=dataset_files,
shuffle=shuffle,
schema=schema,
num_parallel_workers=8,
num_shards=rank_size,
shard_id=rank_id,
shard_equal_rows=True)
else: else:
ds = de.TFRecordDataset(dataset_files=dataset_files,
shuffle=shuffle,
schema=schema,
num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = ds.TFRecordDataset(dataset_files=dataset_files,
shuffle=shuffle,
schema=schema,
num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)


operations_list = [] operations_list = []
for key in columns_list: for key in columns_list:
@@ -249,7 +247,7 @@ def _get_tf_dataset(data_dir,
u = np.array(u).flatten().reshape(batch_size, -1) u = np.array(u).flatten().reshape(batch_size, -1)
return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u


ds = ds.map(
data_set = data_set.map(
operations=mixup, operations=mixup,
input_columns=[ input_columns=[
'label', 'continue_val', 'indicator_id', 'emb_128_id', 'label', 'continue_val', 'indicator_id', 'emb_128_id',
@@ -275,8 +273,8 @@ def _get_tf_dataset(data_dir,
], ],
num_parallel_workers=8) num_parallel_workers=8)


ds = ds.repeat(epochs)
return ds
data_set = data_set.repeat(epochs)
return data_set




def compute_emb_dim(config): def compute_emb_dim(config):
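
Note: this file builds its Schema in a loop over columns_list; the dtype choice (float32 for keys in float_key_list, int32 otherwise) is assumed in the sketch, since that branch is elided from the hunk. Only the ds alias changes. A self-contained version with a trimmed column list:

import mindspore.dataset as ds
import mindspore.common.dtype as mstype

float_key_list = ["label", "continue_val"]
columns_list = ["label", "continue_val", "indicator_id"]  # trimmed for brevity
schema = ds.Schema()
for key in columns_list:
    ms_dtype = mstype.float32 if key in float_key_list else mstype.int32
    schema.add_column(key, de_type=ms_dtype)
print(schema.to_json())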


+34    -37      model_zoo/research/cv/centernet/src/dataset.py

@@ -24,16 +24,17 @@ import cv2
import numpy as np import numpy as np
import pycocotools.coco as coco import pycocotools.coco as coco


import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
from mindspore import log as logger from mindspore import log as logger
from mindspore.mindrecord import FileWriter from mindspore.mindrecord import FileWriter
from src.image import color_aug, get_affine_transform, affine_transform from src.image import color_aug, get_affine_transform, affine_transform
from src.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian, draw_dense_reg from src.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian, draw_dense_reg
from src.visual import visual_image from src.visual import visual_image

_current_dir = os.path.dirname(os.path.realpath(__file__)) _current_dir = os.path.dirname(os.path.realpath(__file__))




class COCOHP(de.Dataset):
class COCOHP(ds.Dataset):
""" """
Encapsulation class of COCO person keypoints datast. Encapsulation class of COCO person keypoints datast.
Initilize and preprocess of image for training and testing. Initilize and preprocess of image for training and testing.
@@ -47,6 +48,7 @@ class COCOHP(de.Dataset):
Returns: Returns:
Prepocessed training or testing dataset for CenterNet network. Prepocessed training or testing dataset for CenterNet network.
""" """

def __init__(self, data_opt, run_mode="train", net_opt=None, enable_visual_image=False, save_path=None): def __init__(self, data_opt, run_mode="train", net_opt=None, enable_visual_image=False, save_path=None):
super(COCOHP, self).__init__() super(COCOHP, self).__init__()
self._data_rng = np.random.RandomState(123) self._data_rng = np.random.RandomState(123)
@@ -64,7 +66,6 @@ class COCOHP(de.Dataset):
if not os.path.exists(self.save_path): if not os.path.exists(self.save_path):
os.makedirs(self.save_path) os.makedirs(self.save_path)



def init(self, data_dir, keep_res=False, flip_test=False): def init(self, data_dir, keep_res=False, flip_test=False):
"""initailize additional info""" """initailize additional info"""
logger.info('Initializing coco 2017 {} data.'.format(self.run_mode)) logger.info('Initializing coco 2017 {} data.'.format(self.run_mode))
@@ -124,7 +125,7 @@ class COCOHP(de.Dataset):
for img_id in self.images: for img_id in self.images:
image_info = self.coco.loadImgs([img_id]) image_info = self.coco.loadImgs([img_id])
annos = self.coco.loadAnns(self.anns[img_id]) annos = self.coco.loadAnns(self.anns[img_id])
#get image
# get image
img_name = image_info[0]['file_name'] img_name = image_info[0]['file_name']
img_name = os.path.join(self.image_path, img_name) img_name = os.path.join(self.image_path, img_name)
with open(img_name, 'rb') as f: with open(img_name, 'rb') as f:
@@ -147,19 +148,16 @@ class COCOHP(de.Dataset):
writer.commit() writer.commit()
logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir)) logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir))



def _coco_box_to_bbox(self, box): def _coco_box_to_bbox(self, box):
bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32) bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32)
return bbox return bbox



def _get_border(self, border, size): def _get_border(self, border, size):
i = 1 i = 1
while size - border // i <= border // i: while size - border // i <= border // i:
i *= 2 i *= 2
return border // i return border // i



def __getitem__(self, index): def __getitem__(self, index):
img_id = self.images[index] img_id = self.images[index]
file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name'] file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
@@ -169,7 +167,6 @@ class COCOHP(de.Dataset):
ret = (img, image_id) ret = (img, image_id)
return ret return ret



def pre_process_for_test(self, image, img_id, scale, meta=None): def pre_process_for_test(self, image, img_id, scale, meta=None):
"""image pre-process for evaluation""" """image pre-process for evaluation"""
b, h, w, ch = image.shape b, h, w, ch = image.shape
@@ -249,7 +246,6 @@ class COCOHP(de.Dataset):


return images, meta return images, meta



def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id): def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id):
"""image pre-process and augmentation""" """image pre-process and augmentation"""
num_objs = min(num_objects, self.data_opt.max_objs) num_objs = min(num_objects, self.data_opt.max_objs)
@@ -269,12 +265,12 @@ class COCOHP(de.Dataset):
else: else:
sf = self.data_opt.scale sf = self.data_opt.scale
cf = self.data_opt.shift cf = self.data_opt.shift
c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
if np.random.random() < self.data_opt.aug_rot: if np.random.random() < self.data_opt.aug_rot:
rf = self.data_opt.rotate rf = self.data_opt.rotate
rot = np.clip(np.random.randn()*rf, -rf*2, rf*2)
rot = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)


if np.random.random() < self.data_opt.flip_prop: if np.random.random() < self.data_opt.flip_prop:
flipped = True flipped = True
@@ -323,7 +319,7 @@ class COCOHP(de.Dataset):
cls_id = int(category_id[k]) - 1 cls_id = int(category_id[k]) - 1
pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3) pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3)
if flipped: if flipped:
bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero
bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero
pts[:, 0] = width - pts[:, 0] - 1 pts[:, 0] = width - pts[:, 0] - 1
for e in self.data_opt.flip_idx: for e in self.data_opt.flip_idx:
pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy() pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy()
@@ -360,7 +356,7 @@ class COCOHP(de.Dataset):
if pts[j, 2] > 0: if pts[j, 2] > 0:
pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot) pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot)
if pts[j, 0] >= 0 and pts[j, 0] < output_res and \ if pts[j, 0] >= 0 and pts[j, 0] < output_res and \
pts[j, 1] >= 0 and pts[j, 1] < output_res:
pts[j, 1] >= 0 and pts[j, 1] < output_res:
kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int
kps_mask[k, j * 2: j * 2 + 2] = 1 kps_mask[k, j * 2: j * 2 + 2] = 1
pt_int = pts[j, :2].astype(np.int32) pt_int = pts[j, :2].astype(np.int32)
@@ -399,7 +395,6 @@ class COCOHP(de.Dataset):
visual_image(out_img, ground_truth, self.save_path, ratio=self.data_opt.input_res[0] // output_res) visual_image(out_img, ground_truth, self.save_path, ratio=self.data_opt.input_res[0] // output_res)
return ret return ret



def create_train_dataset(self, mindrecord_dir, prefix="coco_hp.train.mind", batch_size=1, def create_train_dataset(self, mindrecord_dir, prefix="coco_hp.train.mind", batch_size=1,
device_num=1, rank=0, num_parallel_workers=1, do_shuffle=True): device_num=1, rank=0, num_parallel_workers=1, do_shuffle=True):
"""create train dataset based on mindrecord file""" """create train dataset based on mindrecord file"""
@@ -415,41 +410,43 @@ class COCOHP(de.Dataset):
raise ValueError('data_dir {} have no data files'.format(mindrecord_dir)) raise ValueError('data_dir {} have no data files'.format(mindrecord_dir))


columns = ["image", "num_objects", "keypoints", "bbox", "category_id"] columns = ["image", "num_objects", "keypoints", "bbox", "category_id"]
ds = de.MindDataset(data_files,
columns_list=columns,
num_parallel_workers=num_parallel_workers, shuffle=do_shuffle,
num_shards=device_num, shard_id=rank)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.MindDataset(data_files,
columns_list=columns,
num_parallel_workers=num_parallel_workers, shuffle=do_shuffle,
num_shards=device_num, shard_id=rank)
ori_dataset_size = data_set.get_dataset_size()
logger.info('origin dataset size: {}'.format(ori_dataset_size)) logger.info('origin dataset size: {}'.format(ori_dataset_size))


ds = ds.map(operations=self.preprocess_fn,
input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"],
output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
num_parallel_workers=num_parallel_workers,
python_multiprocessing=True)
ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat count: {}".format(ds.get_repeat_count()))
return ds

data_set = data_set.map(operations=self.preprocess_fn,
input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"],
output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
num_parallel_workers=num_parallel_workers,
python_multiprocessing=True)
data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
return data_set


def create_eval_dataset(self, batch_size=1, num_parallel_workers=1): def create_eval_dataset(self, batch_size=1, num_parallel_workers=1):
"""create testing dataset based on coco format""" """create testing dataset based on coco format"""

def generator(): def generator():
for i in range(self.num_samples): for i in range(self.num_samples):
yield self.__getitem__(i) yield self.__getitem__(i)

column = ["image", "image_id"] column = ["image", "image_id"]
ds = de.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers)
ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
return ds
data_set = ds.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers)
data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
return data_set




if __name__ == '__main__': if __name__ == '__main__':
# Convert coco2017 dataset to mindrecord to improve performance on host # Convert coco2017 dataset to mindrecord to improve performance on host
from src.config import dataset_config from src.config import dataset_config

parser = argparse.ArgumentParser(description='CenterNet MindRecord dataset') parser = argparse.ArgumentParser(description='CenterNet MindRecord dataset')
parser.add_argument("--coco_data_dir", type=str, default="", help="Coco dataset directory.") parser.add_argument("--coco_data_dir", type=str, default="", help="Coco dataset directory.")
parser.add_argument("--mindrecord_dir", type=str, default="", help="MindRecord dataset dir.") parser.add_argument("--mindrecord_dir", type=str, default="", help="MindRecord dataset dir.")


+16    -16      model_zoo/research/cv/ghostnet/src/dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as C import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.vision.py_transforms as P import mindspore.dataset.transforms.vision.py_transforms as P
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
@@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
rank_size = int(os.getenv("RANK_SIZE")) rank_size = int(os.getenv("RANK_SIZE"))
rank_id = int(os.getenv("RANK_ID")) rank_id = int(os.getenv("RANK_ID"))
if rank_size == 1: if rank_size == 1:
ds = de.MindDataset(
data_set = ds.MindDataset(
dataset_path, num_parallel_workers=8, shuffle=True) dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
elif platform == "GPU": elif platform == "GPU":
if do_train: if do_train:
from mindspore.communication.management import get_rank, get_group_size from mindspore.communication.management import get_rank, get_group_size
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else: else:
ds = de.MindDataset(
data_set = ds.MindDataset(
dataset_path, num_parallel_workers=8, shuffle=True) dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
raise ValueError("Unsupport platform.") raise ValueError("Unsupport platform.")
@@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch


color_op = C.RandomColorAdjust( color_op = C.RandomColorAdjust(
brightness=0.4, contrast=0.4, saturation=0.4) brightness=0.4, contrast=0.4, saturation=0.4)
rescale_op = C.Rescale(1/255.0, 0)
rescale_op = C.Rescale(1 / 255.0, 0)
normalize_op = C.Normalize( normalize_op = C.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
change_swap_op = C.HWC2CHW() change_swap_op = C.HWC2CHW()
@@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
trans = composeop() trans = composeop()
type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(input_columns="image", operations=trans,
num_parallel_workers=8)
ds = ds.map(input_columns="label_list",
operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans,
num_parallel_workers=8)
data_set = data_set.map(input_columns="label_list",
operations=type_cast_op, num_parallel_workers=8)


# apply shuffle operations # apply shuffle operations
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set
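
Note: the tail of create_dataset above is the usual shuffle -> batch -> repeat chain; only the variable name changes. A tiny runnable version of that chain on made-up labels:

import numpy as np
import mindspore.dataset as ds

data_set = ds.NumpySlicesDataset(np.arange(8, dtype=np.int32),
                                 column_names=["label_list"], shuffle=False)
data_set = data_set.shuffle(buffer_size=8)
data_set = data_set.batch(4, drop_remainder=True)
print(data_set.get_dataset_size())  # 2 batches of 4 from 8 rows
data_set = data_set.repeat(2)
print(data_set.get_repeat_count())  # 2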

+16    -16      model_zoo/research/cv/ghostnet_quant/src/dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as C import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.vision.py_transforms as P import mindspore.dataset.transforms.vision.py_transforms as P
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
@@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
rank_size = int(os.getenv("RANK_SIZE")) rank_size = int(os.getenv("RANK_SIZE"))
rank_id = int(os.getenv("RANK_ID")) rank_id = int(os.getenv("RANK_ID"))
if rank_size == 1: if rank_size == 1:
ds = de.MindDataset(
data_set = ds.MindDataset(
dataset_path, num_parallel_workers=8, shuffle=True) dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
elif platform == "GPU": elif platform == "GPU":
if do_train: if do_train:
from mindspore.communication.management import get_rank, get_group_size from mindspore.communication.management import get_rank, get_group_size
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else: else:
ds = de.MindDataset(
data_set = ds.MindDataset(
dataset_path, num_parallel_workers=8, shuffle=True) dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
raise ValueError("Unsupport platform.") raise ValueError("Unsupport platform.")
@@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch


color_op = C.RandomColorAdjust( color_op = C.RandomColorAdjust(
brightness=0.4, contrast=0.4, saturation=0.4) brightness=0.4, contrast=0.4, saturation=0.4)
rescale_op = C.Rescale(1/255.0, 0)
rescale_op = C.Rescale(1 / 255.0, 0)
normalize_op = C.Normalize( normalize_op = C.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
change_swap_op = C.HWC2CHW() change_swap_op = C.HWC2CHW()
@@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
trans = composeop() trans = composeop()
type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(input_columns="image", operations=trans,
num_parallel_workers=8)
ds = ds.map(input_columns="label_list",
operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans,
num_parallel_workers=8)
data_set = data_set.map(input_columns="label_list",
operations=type_cast_op, num_parallel_workers=8)


# apply shuffle operations # apply shuffle operations
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set
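
Note: the image transforms defined here (Rescale, Normalize, HWC2CHW) are unchanged by the rename. A sketch of that transform chain on random fake images; mindspore.dataset.vision.c_transforms is assumed as the transform module (this file itself still imports the older transforms.vision path):

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C

fake_images = np.random.randint(0, 255, (4, 8, 8, 3), dtype=np.uint8)
data_set = ds.NumpySlicesDataset(fake_images, column_names=["image"], shuffle=False)
trans = [C.Rescale(1 / 255.0, 0),
         C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         C.HWC2CHW()]
data_set = data_set.map(operations=trans, input_columns="image")
data_set = data_set.batch(2, drop_remainder=True)
row = next(data_set.create_dict_iterator(output_numpy=True))
print(row["image"].shape)  # (2, 3, 8, 8) after HWC2CHW and batching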

+16    -16      model_zoo/research/cv/resnet50_adv_pruning/src/pet_dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.vision.py_transforms as P import mindspore.dataset.vision.py_transforms as P
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
@@ -42,18 +42,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
rank_size = int(os.getenv("RANK_SIZE")) rank_size = int(os.getenv("RANK_SIZE"))
rank_id = int(os.getenv("RANK_ID")) rank_id = int(os.getenv("RANK_ID"))
if rank_size == 1: if rank_size == 1:
ds = de.MindDataset(
data_set = ds.MindDataset(
dataset_path, num_parallel_workers=8, shuffle=True) dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
elif platform == "GPU": elif platform == "GPU":
if do_train: if do_train:
from mindspore.communication.management import get_rank, get_group_size from mindspore.communication.management import get_rank, get_group_size
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else: else:
ds = de.MindDataset(
data_set = ds.MindDataset(
dataset_path, num_parallel_workers=8, shuffle=False) dataset_path, num_parallel_workers=8, shuffle=False)
else: else:
raise ValueError("Unsupport platform.") raise ValueError("Unsupport platform.")
@@ -68,7 +68,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch


color_op = C.RandomColorAdjust( color_op = C.RandomColorAdjust(
brightness=0.4, contrast=0.4, saturation=0.4) brightness=0.4, contrast=0.4, saturation=0.4)
rescale_op = C.Rescale(1/255.0, 0)
rescale_op = C.Rescale(1 / 255.0, 0)
normalize_op = C.Normalize( normalize_op = C.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
change_swap_op = C.HWC2CHW() change_swap_op = C.HWC2CHW()
@@ -88,18 +88,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
trans = composeop trans = composeop
type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(input_columns="image", operations=trans,
num_parallel_workers=8)
ds = ds.map(input_columns="label_list",
operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans,
num_parallel_workers=8)
data_set = data_set.map(input_columns="label_list",
operations=type_cast_op, num_parallel_workers=8)


# apply shuffle operations # apply shuffle operations
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set
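
Note: a small sketch of the label branch (TypeCast on "label_list") plus the batch/repeat bookkeeping, with made-up labels instead of the MindRecord input:

import numpy as np
import mindspore.dataset as ds
import mindspore.common.dtype as mstype
import mindspore.dataset.transforms.c_transforms as C2

labels = np.arange(6, dtype=np.int64)
data_set = ds.NumpySlicesDataset(labels, column_names=["label_list"], shuffle=False)
data_set = data_set.map(operations=C2.TypeCast(mstype.int32),
                        input_columns="label_list")
data_set = data_set.batch(3, drop_remainder=True).repeat(2)
print(data_set.get_batch_size(), data_set.get_repeat_count())  # 3 2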

+35    -35      model_zoo/research/cv/squeezenet/src/dataset.py

@@ -17,7 +17,7 @@ create train or eval dataset.
""" """
import os import os
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size from mindspore.communication.management import init, get_rank, get_group_size
@@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path,
device_num = get_group_size() device_num = get_group_size()


if device_num == 1: if device_num == 1:
ds = de.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
data_set = ds.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
else: else:
ds = de.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
data_set = ds.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)


# define map operations # define map operations
if do_train: if do_train:
@@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path,


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
ds = ds.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
data_set = data_set.map(operations=trans,
input_columns="image",
num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set




def create_dataset_imagenet(dataset_path, def create_dataset_imagenet(dataset_path,
@@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path,
device_num = get_group_size() device_num = get_group_size()


if device_num == 1: if device_num == 1:
ds = de.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
else: else:
ds = de.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)


image_size = 227 image_size = 227
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path,


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
ds = ds.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
data_set = data_set.map(operations=trans,
input_columns="image",
num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set




def _get_rank_info(): def _get_rank_info():


+36    -36      model_zoo/research/recommend/autodis/src/dataset.py

@@ -21,7 +21,7 @@ from enum import Enum


import numpy as np import numpy as np
import pandas as pd import pandas as pd
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype


from .config import DataConfig from .config import DataConfig
@@ -142,8 +142,8 @@ class H5Dataset():
X_id = X[:, 0:self.max_length] X_id = X[:, 0:self.max_length]
X_va = X[:, self.max_length:] X_va = X[:, self.max_length:]
yield np.array(X_id.astype(dtype=np.int32)), \ yield np.array(X_id.astype(dtype=np.int32)), \
np.array(X_va.astype(dtype=np.float32)), \
np.array(y.astype(dtype=np.float32))
np.array(X_va.astype(dtype=np.float32)), \
np.array(y.astype(dtype=np.float32))




def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
@@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
for _ in range(0, numbers_of_batch, 1): for _ in range(0, numbers_of_batch, 1):
yield train_eval_gen.__next__() yield train_eval_gen.__next__()


ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
ds = ds.repeat(epochs)
return ds
data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
data_set = data_set.repeat(epochs)
return data_set




def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
@@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
shuffle = train_mode shuffle = train_mode


if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
else: else:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
@@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
for filename in filenames:
if file_prefixt_name in filename and 'tfrecord' in filename:
dataset_files.append(os.path.join(dir_path, filename))
schema = de.Schema()
schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32)
if rank_size is not None and rank_id is not None:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True)
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True)
else: else:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (
np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000,


+36 -36    tests/st/model_zoo_tests/DeepFM/src/dataset.py

@@ -21,7 +21,7 @@ from enum import Enum


import pandas as pd
import numpy as np
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.common.dtype as mstype


from .config import DataConfig
@@ -142,8 +142,8 @@ class H5Dataset():
X_id = X[:, 0:self.max_length]
X_va = X[:, self.max_length:]
yield np.array(X_id.astype(dtype=np.int32)), \
np.array(X_va.astype(dtype=np.float32)), \
np.array(y.astype(dtype=np.float32))
np.array(X_va.astype(dtype=np.float32)), \
np.array(y.astype(dtype=np.float32))




def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
@@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
for _ in range(0, numbers_of_batch, 1):
yield train_eval_gen.__next__()


ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000)
ds = ds.repeat(epochs)
return ds
data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000)
data_set = data_set.repeat(epochs)
return data_set




def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
@@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
shuffle = train_mode shuffle = train_mode


if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
else: else:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
@@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
for filename in filenames:
if file_prefixt_name in filename and 'tfrecord' in filename:
dataset_files.append(os.path.join(dir_path, filename))
schema = de.Schema()
schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32)
if rank_size is not None and rank_id is not None:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True, num_samples=3000)
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id,
shard_equal_rows=True, num_samples=3000)
else: else:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8, num_samples=3000)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
schema=schema, num_parallel_workers=8, num_samples=3000)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (
np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
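The pattern repeated across these files is mechanical: the module is now imported as `ds` (from `mindspore.dataset`), so the pipeline variable is renamed to `data_set` to stop shadowing that alias. A minimal sketch of the new-style generator pipeline, using a toy in-memory generator in place of the real H5Dataset (the shapes and column names below are illustrative, not from the patch):

import numpy as np
import mindspore.dataset as ds

def _toy_iter():
    # Stand-in for the H5 batch generator: one (ids, weights, labels) block per step.
    for _ in range(5):
        yield (np.zeros((1000, 39), np.int32),
               np.ones((1000, 39), np.float32),
               np.zeros((1000, 1), np.float32))

data_set = ds.GeneratorDataset(_toy_iter, ["ids", "weights", "labels"], shuffle=False)
data_set = data_set.repeat(2)
for row in data_set.create_dict_iterator():
    print(row["ids"].shape)  # (1000, 39)
    break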


+24 -16    tests/st/model_zoo_tests/transformer/test_transformer.py

@@ -24,17 +24,18 @@ from mindspore.nn.optim import Adam
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager
from mindspore.train.callback import Callback
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig
from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \
TransformerTrainOneStepWithLossScaleCell
TransformerTrainOneStepWithLossScaleCell
from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg
from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr


DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"] DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"]



def get_config(version='base', batch_size=1):
"""get config"""
if version == 'large':
@@ -75,23 +76,25 @@ def get_config(version='base', batch_size=1):
transformer_cfg = TransformerConfig(batch_size=batch_size)
return transformer_cfg



def load_test_data(batch_size=1, data_file=None):
"""Load test dataset."""
ds = de.MindDataset(data_file,
columns_list=["source_eos_ids", "source_eos_mask",
"target_sos_ids", "target_sos_mask",
"target_eos_ids", "target_eos_mask"],
shuffle=False)
data_set = ds.MindDataset(data_file,
columns_list=["source_eos_ids", "source_eos_mask",
"target_sos_ids", "target_sos_mask",
"target_eos_ids", "target_eos_mask"],
shuffle=False)
type_cast_op = deC.TypeCast(mstype.int32) type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask")
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set



class ModelCallback(Callback): class ModelCallback(Callback):
def __init__(self): def __init__(self):
@@ -107,13 +110,16 @@ class ModelCallback(Callback):
self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) self.lossscale_list.append(cb_params.net_outputs[2].asnumpy())
print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs)))



class TimeMonitor(Callback): class TimeMonitor(Callback):
"""Time Monitor.""" """Time Monitor."""

def __init__(self, data_size): def __init__(self, data_size):
super(TimeMonitor, self).__init__() super(TimeMonitor, self).__init__()
self.data_size = data_size self.data_size = data_size
self.epoch_mseconds_list = [] self.epoch_mseconds_list = []
self.per_step_mseconds_list = [] self.per_step_mseconds_list = []

def epoch_begin(self, run_context): def epoch_begin(self, run_context):
self.epoch_time = time.time() self.epoch_time = time.time()


@@ -122,6 +128,7 @@ class TimeMonitor(Callback):
self.epoch_mseconds_list.append(epoch_mseconds) self.epoch_mseconds_list.append(epoch_mseconds)
self.per_step_mseconds_list.append(epoch_mseconds / self.data_size) self.per_step_mseconds_list.append(epoch_mseconds / self.data_size)



@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@@ -142,7 +149,7 @@ def test_transformer():
netwithloss = TransformerNetworkWithLoss(config, True) netwithloss = TransformerNetworkWithLoss(config, True)


lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay", lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
training_steps=dataset.get_dataset_size()*epoch_size,
training_steps=dataset.get_dataset_size() * epoch_size,
learning_rate=cfg.lr_schedule.learning_rate, learning_rate=cfg.lr_schedule.learning_rate,
warmup_steps=cfg.lr_schedule.warmup_steps, warmup_steps=cfg.lr_schedule.warmup_steps,
hidden_size=config.hidden_size), mstype.float32) hidden_size=config.hidden_size), mstype.float32)
@@ -193,5 +200,6 @@ def test_transformer():
print("per step mseconds: {}".format(per_step_mseconds)) print("per step mseconds: {}".format(per_step_mseconds))
assert per_step_mseconds <= expect_per_step_mseconds + 2 assert per_step_mseconds <= expect_per_step_mseconds + 2



if __name__ == '__main__':
test_transformer()
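For readers adapting their own evaluation scripts, the renamed load_test_data boils down to the sketch below. The per-column map calls are collapsed into a loop purely for brevity (the patch itself keeps them explicit), and the MindRecord path is a placeholder:

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
import mindspore.common.dtype as mstype

COLUMNS = ["source_eos_ids", "source_eos_mask",
           "target_sos_ids", "target_sos_mask",
           "target_eos_ids", "target_eos_mask"]

def load_test_data(batch_size=1, data_file="/path/to/test-mindrecord"):  # placeholder path
    data_set = ds.MindDataset(data_file, columns_list=COLUMNS, shuffle=False)
    type_cast_op = deC.TypeCast(mstype.int32)
    for col in COLUMNS:  # same six casts as in the patch, just looped
        data_set = data_set.map(operations=type_cast_op, input_columns=col)
    return data_set.batch(batch_size, drop_remainder=True)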

+34 -30    tests/st/model_zoo_tests/wide_and_deep/python_file_for_ci/datasets.py

@@ -14,13 +14,13 @@
# ============================================================================ # ============================================================================
"""train_imagenet.""" """train_imagenet."""



import os
from enum import Enum
import numpy as np
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.common.dtype as mstype



class DataType(Enum):
"""
Enumerate supported dataset format.
@@ -29,6 +29,7 @@ class DataType(Enum):
TFRECORD = 2 TFRECORD = 2
H5 = 3 H5 = 3



def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
line_per_sample=1000, rank_size=None, rank_id=None):
"""
@@ -41,26 +42,29 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
for filename in filenames:
if file_prefix_name in filename and "tfrecord" in filename:
dataset_files.append(os.path.join(dirpath, filename))
schema = de.Schema()
schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32)
if rank_size is not None and rank_id is not None:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
num_parallel_workers=8,
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
else: else:
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample),
drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample),
drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (
np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
#if train_mode:
ds = ds.repeat(epochs)
return ds
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
# if train_mode:
data_set = data_set.repeat(epochs)
return data_set



def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
line_per_sample=1000, rank_size=None, rank_id=None):
@@ -84,23 +88,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
shuffle = train_mode shuffle = train_mode


if rank_size is not None and rank_id is not None: if rank_size is not None and rank_id is not None:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
num_parallel_workers=8)
else: else:
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
ds = ds.repeat(epochs)
return ds
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
columns_list=['feat_ids', 'feat_vals', 'label'],
shuffle=shuffle, num_parallel_workers=8)
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
np.array(y).flatten().reshape(batch_size, 39),
np.array(z).flatten().reshape(batch_size, 1))),
input_columns=['feat_ids', 'feat_vals', 'label'],
column_order=['feat_ids', 'feat_vals', 'label'],
num_parallel_workers=8)
data_set = data_set.repeat(epochs)
return data_set




def create_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
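The TFRecord branch takes the same shape under the new alias: build a Schema, hand it to TFRecordDataset, then batch and repeat on the renamed data_set handle. A condensed sketch with a hypothetical file list (the patch discovers the files by walking data_dir):

import mindspore.dataset as ds
import mindspore.common.dtype as mstype

schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32)

dataset_files = ["/path/to/train_part0.tfrecord"]  # placeholder list
data_set = ds.TFRecordDataset(dataset_files=dataset_files, schema=schema,
                              shuffle=True, num_parallel_workers=8)
data_set = data_set.batch(1, drop_remainder=True)  # batch_size / line_per_sample in the patch
data_set = data_set.repeat(1)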


+21 -17    tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py

@@ -20,7 +20,7 @@ import time
import numpy as np
import pytest
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
from mindspore import context
from mindspore import log as logger
@@ -35,7 +35,6 @@ from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertNetworkWit
from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertTrainOneStepWithLossScaleCell
from model_zoo.official.nlp.bert.src.bert_model import BertConfig



_current_dir = os.path.dirname(os.path.realpath(__file__))
DATA_DIR = ["/home/workspace/mindspore_dataset/bert/example/examples.tfrecord"]
SCHEMA_DIR = "/home/workspace/mindspore_dataset/bert/example/datasetSchema.json"
@@ -88,25 +87,26 @@ def me_de_train_dataset(sink_mode=False):
repeat_count = 1
sink_size = -1
batch_size = 16
ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
"next_sentence_labels", "masked_lm_positions",
"masked_lm_ids", "masked_lm_weights"], shuffle=False)
data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
"next_sentence_labels", "masked_lm_positions",
"masked_lm_ids", "masked_lm_weights"],
shuffle=False)
type_cast_op = C.TypeCast(mstype.int32)
new_repeat_count = repeat_count
if sink_mode:
sink_size = 100
new_repeat_count = 3
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat_count: {}".format(ds.get_repeat_count()))
return ds, new_repeat_count, sink_size
data_set = data_set.batch(batch_size, drop_remainder=True)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat_count: {}".format(data_set.get_repeat_count()))
return data_set, new_repeat_count, sink_size




def weight_variable(shape): def weight_variable(shape):
@@ -155,13 +155,16 @@ class ModelCallback(Callback):
self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) self.lossscale_list.append(cb_params.net_outputs[2].asnumpy())
print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs)))



class TimeMonitor(Callback): class TimeMonitor(Callback):
"""Time Monitor.""" """Time Monitor."""

def __init__(self, data_size): def __init__(self, data_size):
super(TimeMonitor, self).__init__() super(TimeMonitor, self).__init__()
self.data_size = data_size self.data_size = data_size
self.epoch_mseconds_list = [] self.epoch_mseconds_list = []
self.per_step_mseconds_list = [] self.per_step_mseconds_list = []

def epoch_begin(self, run_context): def epoch_begin(self, run_context):
self.epoch_time = time.time() self.epoch_time = time.time()


@@ -178,7 +181,7 @@ class TimeMonitor(Callback):
def test_bert_performance(): def test_bert_performance():
"""test bert performance""" """test bert performance"""
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
ds, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
version = os.getenv('VERSION', 'large') version = os.getenv('VERSION', 'large')
config = get_config(version=version) config = get_config(version=version)
netwithloss = BertNetworkWithLoss(config, True) netwithloss = BertNetworkWithLoss(config, True)
@@ -221,7 +224,7 @@ def test_bert_performance():
logger.info("***************** BERT param name is 3 {}".format(name)) logger.info("***************** BERT param name is 3 {}".format(name))
param.set_data(weight_variable(value.asnumpy().shape)) param.set_data(weight_variable(value.asnumpy().shape))
time_monitor_callback = TimeMonitor(sink_size) time_monitor_callback = TimeMonitor(sink_size)
model.train(new_repeat_count, ds, callbacks=[time_monitor_callback, callback],
model.train(new_repeat_count, data_set, callbacks=[time_monitor_callback, callback],
dataset_sink_mode=True, sink_size=sink_size) dataset_sink_mode=True, sink_size=sink_size)


# assertion occurs while the loss value, overflow state or loss_scale value is wrong # assertion occurs while the loss value, overflow state or loss_scale value is wrong
@@ -250,5 +253,6 @@ def test_bert_performance():
print("per step mseconds: {}".format(per_step_mseconds)) print("per step mseconds: {}".format(per_step_mseconds))
assert per_step_mseconds <= expect_per_step_mseconds + 1 assert per_step_mseconds <= expect_per_step_mseconds + 1



if __name__ == '__main__':
test_bert_performance()
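Beyond style, the variable rename is what makes the shorter import usable inside these functions: once the module is bound to `ds`, assigning to a local `ds` would shadow it. A small illustrative sketch of the pitfall the rename avoids (not code from the patch):

import mindspore.dataset as ds

def build(data_file):
    # ds = ds.TFRecordDataset(data_file)   # would raise UnboundLocalError: assigning to `ds`
    #                                      # makes it a local name, so the right-hand `ds`
    #                                      # no longer refers to the imported module
    data_set = ds.TFRecordDataset(data_file, shuffle=False)
    return data_set.batch(16, drop_remainder=True)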

+27 -23    tests/st/networks/models/bert/bert_performance/test_bert_thor_mlperf.py

@@ -20,7 +20,7 @@ import time
from multiprocessing import Process, Queue
import pytest
import numpy as np
import mindspore.dataset as dataset
import mindspore.dataset as ds
import mindspore.common.dtype as mstype
import mindspore.communication.management as D
from mindspore import context
@@ -28,7 +28,6 @@ from mindspore import log as logger
from mindspore.train.callback import Callback
from mindspore.context import ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine.datasets as de
import mindspore.dataset.transforms.c_transforms as C
from model_zoo.official.nlp.bert_thor.src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepCell
from model_zoo.official.nlp.bert_thor.src.bert_net_config import bert_net_cfg
@@ -45,11 +44,13 @@ train_steps = 200
batch_size = 12


np.random.seed(1)
dataset.config.set_seed(1)
ds.config.set_seed(1)
os.environ['GLOG_v'] = str(2)



class TimeMonitor(Callback): class TimeMonitor(Callback):
"""Time Monitor.""" """Time Monitor."""

def __init__(self, data_size): def __init__(self, data_size):
super(TimeMonitor, self).__init__() super(TimeMonitor, self).__init__()
self.data_size = data_size self.data_size = data_size
@@ -67,6 +68,7 @@ class TimeMonitor(Callback):
self.per_step_mseconds_list.append(per_step_mseconds) self.per_step_mseconds_list.append(per_step_mseconds)
print("epoch: {}, per_step_mseconds are {}".format(cb_params.cur_epoch_num, str(per_step_mseconds)), flush=True) print("epoch: {}, per_step_mseconds are {}".format(cb_params.cur_epoch_num, str(per_step_mseconds)), flush=True)



class LossCallback(Callback): class LossCallback(Callback):
def __init__(self): def __init__(self):
super(LossCallback, self).__init__() super(LossCallback, self).__init__()
@@ -78,6 +80,7 @@ class LossCallback(Callback):
print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num, print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
str(cb_params.net_outputs)), flush=True) str(cb_params.net_outputs)), flush=True)



def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None): def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None):
"""create train dataset""" """create train dataset"""
# apply repeat operations # apply repeat operations
@@ -87,25 +90,25 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
if "tfrecord" in file_name: if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name)) data_files.append(os.path.join(data_dir, file_name))
data_files = sorted(data_files) data_files = sorted(data_files)
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
ori_dataset_size = data_set.get_dataset_size()
print('origin dataset size: ', ori_dataset_size) print('origin dataset size: ', ori_dataset_size)
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat count: {}".format(ds.get_repeat_count()))
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
return data_set




def _set_bert_all_reduce_split(): def _set_bert_all_reduce_split():
@@ -151,13 +154,13 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):
device_num=device_num) device_num=device_num)


bert_net_cfg.num_hidden_layers = 4 bert_net_cfg.num_hidden_layers = 4
ds = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, schema_dir=None)
data_set = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH,
schema_dir=None)
net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)


new_repeat_count = epoch_size * ds.get_dataset_size() // data_sink_steps
new_repeat_count = epoch_size * data_set.get_dataset_size() // data_sink_steps
new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps) new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps)



lr = get_bert_lr() lr = get_bert_lr()
damping = get_bert_damping() damping = get_bert_damping()
optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum, optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum,
@@ -175,7 +178,7 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):


net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
model = Model(net_with_grads, frequency=cfg.Thor.frequency) model = Model(net_with_grads, frequency=cfg.Thor.frequency)
model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps)
model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps)


loss_list = loss_callback.loss_list loss_list = loss_callback.loss_list
per_step_mseconds = time_monitor_callback.per_step_mseconds_list per_step_mseconds = time_monitor_callback.per_step_mseconds_list
@@ -230,5 +233,6 @@ def test_bert_thor_mlperf_8p():
assert mean_cost < 64.2 assert mean_cost < 64.2
assert mean_loss < 7.9 assert mean_loss < 7.9



if __name__ == '__main__':
test_bert_thor_mlperf_8p()
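Note that enum-style options move with the alias as well: `de.Shuffle.FILES` becomes `ds.Shuffle.FILES`. A minimal sharded-read sketch under the new alias, with placeholder file names and columns:

import mindspore.dataset as ds

data_files = ["/path/to/part0.tfrecord", "/path/to/part1.tfrecord"]  # placeholders
data_set = ds.TFRecordDataset(data_files,
                              columns_list=["input_ids", "input_mask", "segment_ids"],
                              shuffle=ds.Shuffle.FILES,   # file-level shuffle, as in the patch
                              num_shards=8, shard_id=0,
                              shard_equal_rows=True)
data_set = data_set.batch(12, drop_remainder=True)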

+18 -17    tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py

@@ -20,7 +20,7 @@ import time
import numpy as np
import pytest
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
from mindspore import context
from mindspore import log as logger
@@ -87,25 +87,26 @@ def me_de_train_dataset(sink_mode=False):
repeat_count = 1 repeat_count = 1
sink_size = -1 sink_size = -1
batch_size = 16 batch_size = 16
ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
"next_sentence_labels", "masked_lm_positions",
"masked_lm_ids", "masked_lm_weights"], shuffle=False)
data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
"next_sentence_labels", "masked_lm_positions",
"masked_lm_ids", "masked_lm_weights"],
shuffle=False)
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
new_repeat_count = repeat_count new_repeat_count = repeat_count
if sink_mode: if sink_mode:
sink_size = 100 sink_size = 100
new_repeat_count = 3 new_repeat_count = 3
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat_count: {}".format(ds.get_repeat_count()))
return ds, new_repeat_count, sink_size
data_set = data_set.batch(batch_size, drop_remainder=True)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat_count: {}".format(data_set.get_repeat_count()))
return data_set, new_repeat_count, sink_size




def weight_variable(shape): def weight_variable(shape):
@@ -178,11 +179,11 @@ def test_bert_percision(enable_graph_kernel=False):
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
if enable_graph_kernel: if enable_graph_kernel:
context.set_context(enable_graph_kernel=True) context.set_context(enable_graph_kernel=True)
ds, new_repeat_count, _ = me_de_train_dataset()
data_set, new_repeat_count, _ = me_de_train_dataset()
version = os.getenv('VERSION', 'large') version = os.getenv('VERSION', 'large')
config = get_config(version=version) config = get_config(version=version)
netwithloss = BertNetworkWithLoss(config, True) netwithloss = BertNetworkWithLoss(config, True)
lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count,
lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count,
learning_rate=5e-5, end_learning_rate=1e-9, learning_rate=5e-5, end_learning_rate=1e-9,
power=10.0, warmup_steps=0) power=10.0, warmup_steps=0)
decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower() decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
@@ -218,7 +219,7 @@ def test_bert_percision(enable_graph_kernel=False):
else: else:
logger.info("***************** BERT param name is 3 {}".format(name)) logger.info("***************** BERT param name is 3 {}".format(name))
param.set_data(weight_variable(value.asnumpy().shape)) param.set_data(weight_variable(value.asnumpy().shape))
model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False)
model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False)


# assertion occurs while the loss value, overflow state or loss_scale value is wrong # assertion occurs while the loss value, overflow state or loss_scale value is wrong
loss_value = np.array(callback.loss_list) loss_value = np.array(callback.loss_list)


+19 -19    tests/st/networks/models/bert/src/dataset.py

@@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
""" """
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger
from .config import bert_net_cfg
@@ -32,24 +32,24 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", d
for file_name in files:
if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name))
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank,
shard_equal_rows=True)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank,
shard_equal_rows=True)
ori_dataset_size = data_set.get_dataset_size()
print('origin dataset size: ', ori_dataset_size) print('origin dataset size: ', ori_dataset_size)
new_repeat_count = int(repeat_count * ori_dataset_size // ds.get_dataset_size())
new_repeat_count = int(repeat_count * ori_dataset_size // data_set.get_dataset_size())
type_cast_op = C.TypeCast(mstype.int32) type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations # apply batch operations
ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
ds = ds.repeat(max(new_repeat_count, repeat_count))
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeatcount: {}".format(ds.get_repeat_count()))
return ds, new_repeat_count
data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True)
data_set = data_set.repeat(max(new_repeat_count, repeat_count))
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeatcount: {}".format(data_set.get_repeat_count()))
return data_set, new_repeat_count

+9 -10    tests/st/networks/models/resnet50/src/dataset.py

@@ -17,7 +17,7 @@


import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2


@@ -39,10 +39,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
device_num = int(os.getenv("RANK_SIZE"))
rank_id = int(os.getenv("RANK_ID"))
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)


image_size = 224 image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -65,15 +65,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
C.HWC2CHW() C.HWC2CHW()
] ]



type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds
data_set = data_set.repeat(repeat_num)
return data_set
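The vision pipelines change in exactly the same way. A stripped-down version of create_dataset under the new alias, with a placeholder ImageNet-style directory and only transforms that appear in the diff:

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2

data_set = ds.ImageFolderDataset("/path/to/imagenet/train",  # placeholder path
                                 num_parallel_workers=8, shuffle=True)
trans = [C.Decode(), C.Resize((224, 224)), C.HWC2CHW()]
type_cast_op = C2.TypeCast(mstype.int32)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.batch(32, drop_remainder=True)
data_set = data_set.repeat(1)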

+10 -11    tests/st/networks/models/resnet50/src_thor/dataset.py

@@ -18,12 +18,11 @@
import os


import mindspore.common.dtype as mstype
import mindspore.dataset as dataset
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C


dataset.config.set_seed(1)
ds.config.set_seed(1)




def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
@@ -43,10 +42,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
device_num = int(os.getenv("RANK_SIZE"))
rank_id = int(os.getenv("RANK_ID"))
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)


image_size = 224 image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -71,12 +70,12 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds
data_set = data_set.repeat(repeat_num)
return data_set

+10 -11    tests/st/quantization/resnet50_quant/dataset.py

@@ -14,11 +14,10 @@
# ============================================================================ # ============================================================================
""" create train dataset. """ """ create train dataset. """



from functools import partial


import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C


@@ -37,8 +36,8 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32):
dataset dataset
""" """


load_func = partial(de.Cifar10Dataset, dataset_path)
ds = load_func(num_parallel_workers=8, shuffle=False)
load_func = partial(ds.Cifar10Dataset, dataset_path)
data_set = load_func(num_parallel_workers=8, shuffle=False)


resize_height = config.image_height resize_height = config.image_height
resize_width = config.image_width resize_width = config.image_width
@@ -54,15 +53,15 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32):


type_cast_op = C2.TypeCast(mstype.int32) type_cast_op = C2.TypeCast(mstype.int32)


ds = ds.map(operations=c_trans, input_columns="image",
num_parallel_workers=8)
ds = ds.map(operations=type_cast_op,
input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=c_trans, input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label", num_parallel_workers=8)


# apply batch operations # apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)


# apply dataset repeat operation # apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)


return ds
return data_set
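Two details from the last few files are worth calling out: module-level helpers such as config.set_seed now hang off the same ds alias (so the extra `import mindspore.dataset as dataset` kept by the thor variants disappears), and dataset constructors can still be wrapped with functools.partial as this quantization script does. A combined sketch under the single alias, with a placeholder CIFAR-10 path:

from functools import partial
import mindspore.dataset as ds

ds.config.set_seed(1)  # replaces dataset.config.set_seed(1) plus its separate import
load_func = partial(ds.Cifar10Dataset, "/path/to/cifar10")  # placeholder path
data_set = load_func(num_parallel_workers=8, shuffle=False)
data_set = data_set.batch(32, drop_remainder=True)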

+63 -60    tests/ut/python/dataset/test_autocontrast.py

@@ -16,7 +16,7 @@
Testing AutoContrast op in DE Testing AutoContrast op in DE
""" """
import numpy as np
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.py_transforms
import mindspore.dataset.vision.py_transforms as F
import mindspore.dataset.vision.c_transforms as C
@@ -36,13 +36,13 @@ def test_auto_contrast_py(plot=False):
logger.info("Test AutoContrast Python Op") logger.info("Test AutoContrast Python Op")


# Original Images # Original Images
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)


transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
F.ToTensor()])


ds_original = ds.map(operations=transforms_original, input_columns="image")
ds_original = data_set.map(operations=transforms_original, input_columns="image")


ds_original = ds_original.batch(512) ds_original = ds_original.batch(512)


@@ -55,7 +55,7 @@ def test_auto_contrast_py(plot=False):
axis=0) axis=0)


# AutoContrast Images # AutoContrast Images
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)


transforms_auto_contrast = \ transforms_auto_contrast = \
mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
@@ -63,7 +63,7 @@ def test_auto_contrast_py(plot=False):
F.AutoContrast(cutoff=10.0, ignore=[10, 20]), F.AutoContrast(cutoff=10.0, ignore=[10, 20]),
F.ToTensor()]) F.ToTensor()])


ds_auto_contrast = ds.map(operations=transforms_auto_contrast, input_columns="image")
ds_auto_contrast = data_set.map(operations=transforms_auto_contrast, input_columns="image")


ds_auto_contrast = ds_auto_contrast.batch(512) ds_auto_contrast = ds_auto_contrast.batch(512)


@@ -96,15 +96,15 @@ def test_auto_contrast_c(plot=False):
logger.info("Test AutoContrast C Op") logger.info("Test AutoContrast C Op")


# AutoContrast Images # AutoContrast Images
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
python_op = F.AutoContrast(cutoff=10.0, ignore=[10, 20])
c_op = C.AutoContrast(cutoff=10.0, ignore=[10, 20])
transforms_op = mindspore.dataset.transforms.py_transforms.Compose([lambda img: F.ToPIL()(img.astype(np.uint8)),
python_op,
np.array])


- ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image")
+ ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image")

ds_auto_contrast_py = ds_auto_contrast_py.batch(512)

@@ -116,10 +116,10 @@ def test_auto_contrast_c(plot=False):
image.asnumpy(),
axis=0)

- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])

- ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image")
+ ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image")

ds_auto_contrast_c = ds_auto_contrast_c.batch(512)

@@ -153,8 +153,8 @@ def test_auto_contrast_one_channel_c(plot=False):
logger.info("Test AutoContrast C Op With One Channel Images")

# AutoContrast Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
python_op = F.AutoContrast()
c_op = C.AutoContrast()
# not using F.ToTensor() since it converts to floats
@@ -164,7 +164,7 @@ def test_auto_contrast_one_channel_c(plot=False):
python_op,
np.array])

- ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image")
+ ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image")

ds_auto_contrast_py = ds_auto_contrast_py.batch(512)

@@ -176,11 +176,11 @@ def test_auto_contrast_one_channel_c(plot=False):
image.asnumpy(),
axis=0)

- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])],
- input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])],
+ input_columns=["image"])

- ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image")
+ ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image")

ds_auto_contrast_c = ds_auto_contrast_c.batch(512)

@@ -208,9 +208,9 @@ def test_auto_contrast_mnist_c(plot=False):
Test AutoContrast C op with MNIST dataset (Grayscale images)
"""
logger.info("Test AutoContrast C Op With MNIST Images")
- ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
- ds_auto_contrast_c = ds.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image")
- ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ ds_auto_contrast_c = data_set.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image")
+ ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)

images = []
images_trans = []
@@ -236,21 +236,21 @@ def test_auto_contrast_invalid_ignore_param_c():
"""
logger.info("Test AutoContrast C Op with invalid ignore parameter")
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(),
- C.Resize((224, 224)),
- lambda img: np.array(img[:, :, 0])], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(),
+ C.Resize((224, 224)),
+ lambda img: np.array(img[:, :, 0])], input_columns=["image"])
# invalid ignore
- ds = ds.map(operations=C.AutoContrast(ignore=255.5), input_columns="image")
+ data_set = data_set.map(operations=C.AutoContrast(ignore=255.5), input_columns="image")
except TypeError as error:
logger.info("Got an exception in DE: {}".format(str(error)))
assert "Argument ignore with value 255.5 is not of type" in str(error)
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)),
- lambda img: np.array(img[:, :, 0])], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)),
+ lambda img: np.array(img[:, :, 0])], input_columns=["image"])
# invalid ignore
- ds = ds.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image")
+ data_set = data_set.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image")
except TypeError as error:
logger.info("Got an exception in DE: {}".format(str(error)))
assert "Argument ignore with value (10,100) is not of type" in str(error)
@@ -262,22 +262,22 @@ def test_auto_contrast_invalid_cutoff_param_c():
"""
logger.info("Test AutoContrast C Op with invalid cutoff parameter")
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(),
- C.Resize((224, 224)),
- lambda img: np.array(img[:, :, 0])], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(),
+ C.Resize((224, 224)),
+ lambda img: np.array(img[:, :, 0])], input_columns=["image"])
# invalid ignore
- ds = ds.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image")
+ data_set = data_set.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image")
except ValueError as error:
logger.info("Got an exception in DE: {}".format(str(error)))
assert "Input cutoff is not within the required interval of (0 to 100)." in str(error)
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(),
- C.Resize((224, 224)),
- lambda img: np.array(img[:, :, 0])], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(),
+ C.Resize((224, 224)),
+ lambda img: np.array(img[:, :, 0])], input_columns=["image"])
# invalid ignore
- ds = ds.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image")
+ data_set = data_set.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image")
except ValueError as error:
logger.info("Got an exception in DE: {}".format(str(error)))
assert "Input cutoff is not within the required interval of (0 to 100)." in str(error)
@@ -289,22 +289,24 @@ def test_auto_contrast_invalid_ignore_param_py():
"""
logger.info("Test AutoContrast python Op with invalid ignore parameter")
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
- F.Resize((224, 224)),
- F.AutoContrast(ignore=255.5),
- F.ToTensor()])],
- input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
+ F.Resize((224, 224)),
+ F.AutoContrast(
+ ignore=255.5),
+ F.ToTensor()])],
+ input_columns=["image"])
except TypeError as error:
logger.info("Got an exception in DE: {}".format(str(error)))
assert "Argument ignore with value 255.5 is not of type" in str(error)
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
- F.Resize((224, 224)),
- F.AutoContrast(ignore=(10, 100)),
- F.ToTensor()])],
- input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
+ F.Resize((224, 224)),
+ F.AutoContrast(
+ ignore=(10, 100)),
+ F.ToTensor()])],
+ input_columns=["image"])
except TypeError as error:
logger.info("Got an exception in DE: {}".format(str(error)))
assert "Argument ignore with value (10,100) is not of type" in str(error)
@@ -316,18 +318,19 @@ def test_auto_contrast_invalid_cutoff_param_py():
"""
logger.info("Test AutoContrast python Op with invalid cutoff parameter")
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
- F.Resize((224, 224)),
- F.AutoContrast(cutoff=-10.0),
- F.ToTensor()])],
- input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
+ F.Resize((224, 224)),
+ F.AutoContrast(
+ cutoff=-10.0),
+ F.ToTensor()])],
+ input_columns=["image"])
except ValueError as error:
logger.info("Got an exception in DE: {}".format(str(error)))
assert "Input cutoff is not within the required interval of (0 to 100)." in str(error)
try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(
operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
F.AutoContrast(cutoff=120.0),


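The remaining test files repeat the same two-part change applied above: the dataset package is imported as `ds` instead of the engine alias `de`, and the local pipeline variable is renamed from `ds` to `data_set` so it no longer shadows the module alias. A minimal sketch of the resulting pattern (the directory path below is illustrative, not taken from the diff):

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C

DATA_DIR = "../data/dataset/testImageNetData/train/"  # illustrative path

# The pipeline lives in data_set, so the name no longer shadows the ds module alias.
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
data_set = data_set.batch(512)
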
+ 25
- 25
tests/ut/python/dataset/test_equalize.py

@@ -17,7 +17,7 @@ Testing Equalize op in DE
"""
import numpy as np

- import mindspore.dataset.engine as de
+ import mindspore.dataset as ds
import mindspore.dataset.transforms.py_transforms
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.vision.py_transforms as F
@@ -37,13 +37,13 @@ def test_equalize_py(plot=False):
logger.info("Test Equalize")

# Original Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
F.ToTensor()])

- ds_original = ds.map(operations=transforms_original, input_columns="image")
+ ds_original = data_set.map(operations=transforms_original, input_columns="image")

ds_original = ds_original.batch(512)

@@ -56,14 +56,14 @@ def test_equalize_py(plot=False):
axis=0)

# Color Equalized Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_equalize = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
F.Equalize(),
F.ToTensor()])

- ds_equalize = ds.map(operations=transforms_equalize, input_columns="image")
+ ds_equalize = data_set.map(operations=transforms_equalize, input_columns="image")

ds_equalize = ds_equalize.batch(512)

@@ -92,11 +92,11 @@ def test_equalize_c(plot=False):
logger.info("Test Equalize cpp op")

# Original Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = [C.Decode(), C.Resize(size=[224, 224])]

- ds_original = ds.map(operations=transforms_original, input_columns="image")
+ ds_original = data_set.map(operations=transforms_original, input_columns="image")

ds_original = ds_original.batch(512)

@@ -109,12 +109,12 @@ def test_equalize_c(plot=False):
axis=0)

# Equalize Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transform_equalize = [C.Decode(), C.Resize(size=[224, 224]),
C.Equalize()]

- ds_equalize = ds.map(operations=transform_equalize, input_columns="image")
+ ds_equalize = data_set.map(operations=transform_equalize, input_columns="image")

ds_equalize = ds_equalize.batch(512)

@@ -142,10 +142,10 @@ def test_equalize_py_c(plot=False):
logger.info("Test Equalize cpp and python op")

# equalize Images in cpp
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])

- ds_c_equalize = ds.map(operations=C.Equalize(), input_columns="image")
+ ds_c_equalize = data_set.map(operations=C.Equalize(), input_columns="image")

ds_c_equalize = ds_c_equalize.batch(512)

@@ -158,15 +158,15 @@ def test_equalize_py_c(plot=False):
axis=0)

# Equalize images in python
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])

transforms_p_equalize = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8),
F.ToPIL(),
F.Equalize(),
np.array])

- ds_p_equalize = ds.map(operations=transforms_p_equalize, input_columns="image")
+ ds_p_equalize = data_set.map(operations=transforms_p_equalize, input_columns="image")

ds_p_equalize = ds_p_equalize.batch(512)

@@ -197,11 +197,11 @@ def test_equalize_one_channel():
c_op = C.Equalize()

try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)),
- lambda img: np.array(img[:, :, 0])], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)),
+ lambda img: np.array(img[:, :, 0])], input_columns=["image"])

- ds.map(operations=c_op, input_columns="image")
+ data_set.map(operations=c_op, input_columns="image")

except RuntimeError as e:
logger.info("Got an exception in DE: {}".format(str(e)))
@@ -213,9 +213,9 @@ def test_equalize_mnist_c(plot=False):
Test Equalize C op with MNIST dataset (Grayscale images)
"""
logger.info("Test Equalize C Op With MNIST Images")
- ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
- ds_equalize_c = ds.map(operations=C.Equalize(), input_columns="image")
- ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ ds_equalize_c = data_set.map(operations=C.Equalize(), input_columns="image")
+ ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)

images = []
images_trans = []
@@ -242,7 +242,7 @@ def test_equalize_md5_py():
logger.info("Test Equalize")

# First dataset
- data1 = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data1 = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Equalize(),
F.ToTensor()])
@@ -260,14 +260,14 @@ def test_equalize_md5_c():
logger.info("Test Equalize cpp op with md5 check")

# Generate dataset
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_equalize = [C.Decode(),
C.Resize(size=[224, 224]),
C.Equalize(),
F.ToTensor()]

- data = ds.map(operations=transforms_equalize, input_columns="image")
+ data = data_set.map(operations=transforms_equalize, input_columns="image")
# Compare with expected md5 from images
filename = "equalize_01_result_c.npz"
save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)


+ 23
- 23
tests/ut/python/dataset/test_invert.py

@@ -17,7 +17,7 @@ Testing Invert op in DE
"""
import numpy as np

- import mindspore.dataset.engine as de
+ import mindspore.dataset as ds
import mindspore.dataset.transforms.py_transforms
import mindspore.dataset.vision.py_transforms as F
import mindspore.dataset.vision.c_transforms as C
@@ -36,13 +36,13 @@ def test_invert_py(plot=False):
logger.info("Test Invert Python op")

# Original Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
F.ToTensor()])

- ds_original = ds.map(operations=transforms_original, input_columns="image")
+ ds_original = data_set.map(operations=transforms_original, input_columns="image")

ds_original = ds_original.batch(512)

@@ -55,14 +55,14 @@ def test_invert_py(plot=False):
axis=0)

# Color Inverted Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
F.Invert(),
F.ToTensor()])

- ds_invert = ds.map(operations=transforms_invert, input_columns="image")
+ ds_invert = data_set.map(operations=transforms_invert, input_columns="image")

ds_invert = ds_invert.batch(512)

@@ -91,11 +91,11 @@ def test_invert_c(plot=False):
logger.info("Test Invert cpp op")

# Original Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = [C.Decode(), C.Resize(size=[224, 224])]

- ds_original = ds.map(operations=transforms_original, input_columns="image")
+ ds_original = data_set.map(operations=transforms_original, input_columns="image")

ds_original = ds_original.batch(512)

@@ -108,12 +108,12 @@ def test_invert_c(plot=False):
axis=0)

# Invert Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transform_invert = [C.Decode(), C.Resize(size=[224, 224]),
C.Invert()]

- ds_invert = ds.map(operations=transform_invert, input_columns="image")
+ ds_invert = data_set.map(operations=transform_invert, input_columns="image")

ds_invert = ds_invert.batch(512)

@@ -141,10 +141,10 @@ def test_invert_py_c(plot=False):
logger.info("Test Invert cpp and python op")

# Invert Images in cpp
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])

- ds_c_invert = ds.map(operations=C.Invert(), input_columns="image")
+ ds_c_invert = data_set.map(operations=C.Invert(), input_columns="image")

ds_c_invert = ds_c_invert.batch(512)

@@ -157,15 +157,15 @@ def test_invert_py_c(plot=False):
axis=0)

# invert images in python
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])

transforms_p_invert = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8),
F.ToPIL(),
F.Invert(),
np.array])

- ds_p_invert = ds.map(operations=transforms_p_invert, input_columns="image")
+ ds_p_invert = data_set.map(operations=transforms_p_invert, input_columns="image")

ds_p_invert = ds_p_invert.batch(512)

@@ -196,11 +196,11 @@ def test_invert_one_channel():
c_op = C.Invert()

try:
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
- ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)),
- lambda img: np.array(img[:, :, 0])], input_columns=["image"])
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)),
+ lambda img: np.array(img[:, :, 0])], input_columns=["image"])

- ds.map(operations=c_op, input_columns="image")
+ data_set.map(operations=c_op, input_columns="image")

except RuntimeError as e:
logger.info("Got an exception in DE: {}".format(str(e)))
@@ -214,13 +214,13 @@ def test_invert_md5_py():
logger.info("Test Invert python op with md5 check")

# Generate dataset
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Invert(),
F.ToTensor()])

- data = ds.map(operations=transforms_invert, input_columns="image")
+ data = data_set.map(operations=transforms_invert, input_columns="image")
# Compare with expected md5 from images
filename = "invert_01_result_py.npz"
save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)
@@ -233,14 +233,14 @@ def test_invert_md5_c():
logger.info("Test Invert cpp op with md5 check")

# Generate dataset
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_invert = [C.Decode(),
C.Resize(size=[224, 224]),
C.Invert(),
F.ToTensor()]

- data = ds.map(operations=transforms_invert, input_columns="image")
+ data = data_set.map(operations=transforms_invert, input_columns="image")
# Compare with expected md5 from images
filename = "invert_01_result_c.npz"
save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)


+ 4
- 5
tests/ut/python/dataset/test_random_color.py

@@ -19,7 +19,6 @@ import numpy as np
import pytest

import mindspore.dataset as ds
- import mindspore.dataset.engine as de
import mindspore.dataset.transforms.py_transforms
import mindspore.dataset.vision.c_transforms as vision
import mindspore.dataset.vision.py_transforms as F
@@ -44,7 +43,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False):
logger.info("Test RandomColor")

# Original Images
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
@@ -63,7 +62,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False):
axis=0)

# Random Color Adjusted Images
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_random_color = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
@@ -146,7 +145,7 @@ def test_random_color_py_md5():
original_num_parallel_workers = config_get_set_num_parallel_workers(1)

# Generate dataset
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.RandomColor((2.0, 2.5)),
@@ -234,7 +233,7 @@ def test_random_color_c_errors():
assert "degrees must be a sequence with length 2." in str(error_info.value)

# RandomColor Cpp Op will fail with one channel input
- mnist_ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ mnist_ds = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
mnist_ds = mnist_ds.map(operations=vision.RandomColor(), input_columns="image")

with pytest.raises(RuntimeError) as error_info:


+ 13
- 14
tests/ut/python/dataset/test_random_sharpness.py

@@ -17,7 +17,6 @@ Testing RandomSharpness op in DE
"""
import numpy as np
import mindspore.dataset as ds
- import mindspore.dataset.engine as de
import mindspore.dataset.transforms.py_transforms
import mindspore.dataset.vision.py_transforms as F
import mindspore.dataset.vision.c_transforms as C
@@ -38,7 +37,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False):
logger.info("Test RandomSharpness python op")

# Original Images
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
@@ -57,7 +56,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False):
axis=0)

# Random Sharpness Adjusted Images
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

py_op = F.RandomSharpness()
if degrees is not None:
@@ -108,7 +107,7 @@ def test_random_sharpness_py_md5():
transform = mindspore.dataset.transforms.py_transforms.Compose(transforms)

# Generate dataset
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data = data.map(operations=transform, input_columns=["image"])

# check results with md5 comparison
@@ -128,7 +127,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False):
logger.info("Test RandomSharpness cpp op")

# Original Images
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = [C.Decode(),
C.Resize((224, 224))]
@@ -146,7 +145,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False):
axis=0)

# Random Sharpness Adjusted Images
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

c_op = C.RandomSharpness()
if degrees is not None:
@@ -194,7 +193,7 @@ def test_random_sharpness_c_md5():
]

# Generate dataset
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data = data.map(operations=transforms, input_columns=["image"])

# check results with md5 comparison
@@ -213,7 +212,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False):
logger.info("Test RandomSharpness C and python Op")

# RandomSharpness Images
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"])

python_op = F.RandomSharpness(degrees)
@@ -236,7 +235,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False):
image,
axis=0)

- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"])

ds_images_random_sharpness_c = data.map(operations=c_op, input_columns="image")
@@ -271,10 +270,10 @@ def test_random_sharpness_one_channel_c(degrees=(1.4, 1.4), plot=False):
if degrees is not None:
c_op = C.RandomSharpness(degrees)
# RandomSharpness Images
- data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
ds_random_sharpness_c = data.map(operations=c_op, input_columns="image")
# Original images
- data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)

images = []
images_trans = []
@@ -296,7 +295,7 @@ def test_random_sharpness_invalid_params():
"""
logger.info("Test RandomSharpness with invalid input parameters.")
try:
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data = data.map(operations=[C.Decode(), C.Resize((224, 224)),
C.RandomSharpness(10)], input_columns=["image"])
except TypeError as error:
@@ -304,7 +303,7 @@ def test_random_sharpness_invalid_params():
assert "tuple" in str(error)

try:
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data = data.map(operations=[C.Decode(), C.Resize((224, 224)),
C.RandomSharpness((-10, 10))], input_columns=["image"])
except ValueError as error:
@@ -312,7 +311,7 @@ def test_random_sharpness_invalid_params():
assert "interval" in str(error)

try:
- data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
data = data.map(operations=[C.Decode(), C.Resize((224, 224)),
C.RandomSharpness((10, 5))], input_columns=["image"])
except ValueError as error:


+ 2
- 3
tests/ut/python/dataset/test_random_solarize_op.py

@@ -17,7 +17,6 @@ Testing RandomSolarizeOp op in DE
"""
import pytest
import mindspore.dataset as ds
- import mindspore.dataset.engine as de
import mindspore.dataset.vision.c_transforms as vision
from mindspore import log as logger
from util import visualize_list, save_and_check_md5, config_get_set_seed, config_get_set_num_parallel_workers, \
@@ -78,8 +77,8 @@ def test_random_solarize_mnist(plot=False, run_golden=True):
Test RandomSolarize op with MNIST dataset (Grayscale images)
"""

- mnist_1 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
- mnist_2 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ mnist_1 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
+ mnist_2 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
mnist_2 = mnist_2.map(operations=vision.RandomSolarize((0, 255)), input_columns="image")

images = []


+ 10
- 10
tests/ut/python/dataset/test_uniform_augment.py

@@ -18,7 +18,7 @@ Testing UniformAugment in DE
import numpy as np
import pytest

- import mindspore.dataset.engine as de
+ import mindspore.dataset as ds
import mindspore.dataset.transforms.py_transforms
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.vision.py_transforms as F
@@ -35,13 +35,13 @@ def test_uniform_augment(plot=False, num_ops=2):
logger.info("Test UniformAugment")

# Original Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
F.Resize((224, 224)),
F.ToTensor()])

- ds_original = ds.map(operations=transforms_original, input_columns="image")
+ ds_original = data_set.map(operations=transforms_original, input_columns="image")

ds_original = ds_original.batch(512)

@@ -54,7 +54,7 @@ def test_uniform_augment(plot=False, num_ops=2):
axis=0)

# UniformAugment Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transform_list = [F.RandomRotation(45),
F.RandomColor(),
@@ -70,7 +70,7 @@ def test_uniform_augment(plot=False, num_ops=2):
num_ops=num_ops),
F.ToTensor()])

- ds_ua = ds.map(operations=transforms_ua, input_columns="image")
+ ds_ua = data_set.map(operations=transforms_ua, input_columns="image")

ds_ua = ds_ua.batch(512)

@@ -99,12 +99,12 @@ def test_cpp_uniform_augment(plot=False, num_ops=2):
logger.info("Test CPP UniformAugment")

# Original Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

transforms_original = [C.Decode(), C.Resize(size=[224, 224]),
F.ToTensor()]

- ds_original = ds.map(operations=transforms_original, input_columns="image")
+ ds_original = data_set.map(operations=transforms_original, input_columns="image")

ds_original = ds_original.batch(512)

@@ -117,7 +117,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2):
axis=0)

# UniformAugment Images
- ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
+ data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
transforms_ua = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]),
C.RandomHorizontalFlip(),
C.RandomVerticalFlip(),
@@ -130,7 +130,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2):
uni_aug,
F.ToTensor()]

- ds_ua = ds.map(operations=transforms_all, input_columns="image", num_parallel_workers=1)
+ ds_ua = data_set.map(operations=transforms_all, input_columns="image", num_parallel_workers=1)

ds_ua = ds_ua.batch(512)

@@ -240,7 +240,7 @@ def test_cpp_uniform_augment_random_crop_badinput(num_ops=1):
logger.info("Test CPP UniformAugment with random_crop bad input")
batch_size = 2
cifar10_dir = "../data/dataset/testCifar10Data"
- ds1 = de.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
+ ds1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]

transforms_ua = [
# Note: crop size [224, 224] > image size [32, 32]

