/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// NOTE(review): the original first #include lost its target when the file was
// mangled; <exception> restored since THROW_IF_ERROR throws std::runtime_error.
#include <exception>

#include "dataset/api/de_pipeline.h"
#include "dataset/engine/datasetops/source/cifar_op.h"
#include "dataset/engine/datasetops/source/clue_op.h"
#include "dataset/engine/datasetops/source/coco_op.h"
#include "dataset/engine/datasetops/source/image_folder_op.h"
#include "dataset/engine/datasetops/source/io_block.h"
#include "dataset/engine/datasetops/source/manifest_op.h"
#include "dataset/engine/datasetops/source/mindrecord_op.h"
#include "dataset/engine/datasetops/source/mnist_op.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/datasetops/source/sampler/distributed_sampler.h"
#include "dataset/engine/datasetops/source/sampler/pk_sampler.h"
#include "dataset/engine/datasetops/source/sampler/python_sampler.h"
#include "dataset/engine/datasetops/source/sampler/random_sampler.h"
#include "dataset/engine/datasetops/source/sampler/sequential_sampler.h"
#include "dataset/engine/datasetops/source/sampler/subset_random_sampler.h"
#include "dataset/engine/datasetops/source/sampler/weighted_random_sampler.h"
#include "dataset/engine/datasetops/source/text_file_op.h"
#include "dataset/engine/datasetops/source/tf_reader_op.h"
#include "dataset/engine/datasetops/source/voc_op.h"
#include "dataset/engine/gnn/graph.h"
#include "dataset/engine/jagged_connector.h"
#include "dataset/kernels/data/concatenate_op.h"
#include "dataset/kernels/data/duplicate_op.h"
#include "dataset/kernels/data/fill_op.h"
#include "dataset/kernels/data/mask_op.h"
#include "dataset/kernels/data/one_hot_op.h"
#include "dataset/kernels/data/pad_end_op.h"
#include "dataset/kernels/data/slice_op.h"
#include "dataset/kernels/data/to_float16_op.h"
#include "dataset/kernels/data/type_cast_op.h"
#include "dataset/kernels/image/bounding_box_augment_op.h"
#include "dataset/kernels/image/center_crop_op.h"
#include "dataset/kernels/image/cut_out_op.h"
#include "dataset/kernels/image/decode_op.h"
#include "dataset/kernels/image/hwc_to_chw_op.h"
#include "dataset/kernels/image/image_utils.h"
#include "dataset/kernels/image/normalize_op.h"
#include "dataset/kernels/image/pad_op.h"
#include "dataset/kernels/image/random_color_adjust_op.h"
#include "dataset/kernels/image/random_crop_and_resize_op.h"
#include "dataset/kernels/image/random_crop_and_resize_with_bbox_op.h"
#include "dataset/kernels/image/random_crop_decode_resize_op.h"
#include "dataset/kernels/image/random_crop_op.h"
#include "dataset/kernels/image/random_crop_with_bbox_op.h"
#include "dataset/kernels/image/random_horizontal_flip_bbox_op.h"
#include "dataset/kernels/image/random_horizontal_flip_op.h"
#include "dataset/kernels/image/random_resize_op.h"
#include "dataset/kernels/image/random_rotation_op.h"
#include "dataset/kernels/image/random_vertical_flip_op.h"
#include "dataset/kernels/image/random_vertical_flip_with_bbox_op.h"
#include "dataset/kernels/image/rescale_op.h"
#include "dataset/kernels/image/resize_bilinear_op.h"
#include "dataset/kernels/image/resize_op.h"
#include "dataset/kernels/image/uniform_aug_op.h"
#include "dataset/kernels/no_op.h"
#include "dataset/text/kernels/jieba_tokenizer_op.h"
#include "dataset/text/kernels/lookup_op.h"
#include "dataset/text/kernels/ngram_op.h"
#include "dataset/text/kernels/to_number_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/util/random.h"
#include "mindrecord/include/shard_distributed_sample.h"
#include "mindrecord/include/shard_operator.h"
#include "mindrecord/include/shard_pk_sample.h"
#include "mindrecord/include/shard_sample.h"
#include "mindrecord/include/shard_sequential_sample.h"
// NOTE(review): MindrecordRandomSampler below constructs mindrecord::ShardShuffle;
// this include was missing from the mangled source — confirm header path.
#include "mindrecord/include/shard_shuffle.h"
#include "mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"

#ifdef ENABLE_ICU4C
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/bert_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#endif

namespace py = pybind11;

namespace mindspore {
namespace dataset {
// Evaluate a Status-returning expression; on failure convert it into a C++
// exception, which pybind11 translates into a Python RuntimeError.
#define THROW_IF_ERROR(s)                                      \
  do {                                                         \
    Status rc = std::move(s);                                  \
    if (rc.IsError()) throw std::runtime_error(rc.ToString()); \
  } while (false)

// Bind the DEPipeline driver: dataset-tree construction, launch, and iteration.
void bindDEPipeline(py::module *m) {
  (void)py::class_<DEPipeline>(*m, "DEPipeline")
    .def(py::init<>())
    // Build a dataset op node from its OpName + python kwargs and add it to the tree.
    .def(
      "AddNodeToTree",
      [](DEPipeline &de, const OpName &op_name, const py::dict &args) {
        DsOpPtr op;
        THROW_IF_ERROR(de.AddNodeToTree(op_name, args, &op));
        return op;
      },
      py::return_value_policy::reference)
    .def_static("AddChildToParentNode",
                [](const DsOpPtr &child_op, const DsOpPtr &parent_op) {
                  THROW_IF_ERROR(DEPipeline::AddChildToParentNode(child_op, parent_op));
                })
    .def("AssignRootNode",
         [](DEPipeline &de, const DsOpPtr &dataset_op) { THROW_IF_ERROR(de.AssignRootNode(dataset_op)); })
    .def("SetBatchParameters",
         [](DEPipeline &de, const py::dict &args) { THROW_IF_ERROR(de.SetBatchParameters(args)); })
    .def("LaunchTreeExec", [](DEPipeline &de) { THROW_IF_ERROR(de.LaunchTreeExec()); })
    .def("GetNextAsMap",
         [](DEPipeline &de) {
           py::dict out;
           THROW_IF_ERROR(de.GetNextAsMap(&out));
           return out;
         })
    .def("GetNextAsList",
         [](DEPipeline &de) {
           py::list out;
           THROW_IF_ERROR(de.GetNextAsList(&out));
           return out;
         })
    .def("GetOutputShapes",
         [](DEPipeline &de) {
           py::list out;
           THROW_IF_ERROR(de.GetOutputShapes(&out));
           return out;
         })
    .def("GetOutputTypes",
         [](DEPipeline &de) {
           py::list out;
           THROW_IF_ERROR(de.GetOutputTypes(&out));
           return out;
         })
    .def("GetDatasetSize", &DEPipeline::GetDatasetSize)
    .def("GetBatchSize", &DEPipeline::GetBatchSize)
    .def("GetNumClasses", &DEPipeline::GetNumClasses)
    .def("GetRepeatCount", &DEPipeline::GetRepeatCount);
}

// Bind per-dataset static helpers (row counting / class indexing) for the leaf ops.
void bindDatasetOps(py::module *m) {
  (void)py::class_<TFReaderOp, DatasetOp, std::shared_ptr<TFReaderOp>>(*m, "TFReaderOp")
    .def_static("get_num_rows", [](const py::list &files, int64_t numParallelWorkers, bool estimate = false) {
      int64_t count = 0;
      std::vector<std::string> filenames;
      // None entries become empty strings rather than being dropped.
      for (auto l : files) {
        !l.is_none() ? filenames.push_back(py::str(l)) : (void)filenames.emplace_back("");
      }
      THROW_IF_ERROR(TFReaderOp::CountTotalRows(&count, filenames, numParallelWorkers, estimate));
      return count;
    });

  (void)py::class_<CifarOp, DatasetOp, std::shared_ptr<CifarOp>>(*m, "CifarOp")
    .def_static("get_num_rows", [](const std::string &dir, bool isCifar10) {
      int64_t count = 0;
      THROW_IF_ERROR(CifarOp::CountTotalRows(dir, isCifar10, &count));
      return count;
    });

  (void)py::class_<ImageFolderOp, DatasetOp, std::shared_ptr<ImageFolderOp>>(*m, "ImageFolderOp")
    .def_static("get_num_rows_and_classes", [](const std::string &path) {
      int64_t count = 0, num_classes = 0;
      THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, std::set<std::string>{}, &count, &num_classes));
      return py::make_tuple(count, num_classes);
    });

  (void)py::class_<MindRecordOp, DatasetOp, std::shared_ptr<MindRecordOp>>(*m, "MindRecordOp")
    .def_static("get_num_rows", [](const std::vector<std::string> &paths, bool load_dataset,
                                   const py::object &sampler, const int64_t num_padded) {
      int64_t count = 0;
      std::shared_ptr<mindrecord::ShardOperator> op;
      // A python sampler may provide its mindrecord counterpart for counting.
      if (py::hasattr(sampler, "create_for_minddataset")) {
        auto create = sampler.attr("create_for_minddataset");
        op = create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
      }
      THROW_IF_ERROR(MindRecordOp::CountTotalRows(paths, load_dataset, op, &count, num_padded));
      return count;
    });

  (void)py::class_<ManifestOp, DatasetOp, std::shared_ptr<ManifestOp>>(*m, "ManifestOp")
    .def_static("get_num_rows_and_classes",
                [](const std::string &file, const py::dict &dict, const std::string &usage) {
                  int64_t count = 0, num_classes = 0;
                  THROW_IF_ERROR(ManifestOp::CountTotalRows(file, dict, usage, &count, &num_classes));
                  return py::make_tuple(count, num_classes);
                })
    .def_static("get_class_indexing", [](const std::string &file, const py::dict &dict, const std::string &usage) {
      std::map<std::string, int32_t> output_class_indexing;
      THROW_IF_ERROR(ManifestOp::GetClassIndexing(file, dict, usage, &output_class_indexing));
      return output_class_indexing;
    });

  (void)py::class_<MnistOp, DatasetOp, std::shared_ptr<MnistOp>>(*m, "MnistOp")
    .def_static("get_num_rows", [](const std::string &dir) {
      int64_t count = 0;
      THROW_IF_ERROR(MnistOp::CountTotalRows(dir, &count));
      return count;
    });

  (void)py::class_<TextFileOp, DatasetOp, std::shared_ptr<TextFileOp>>(*m, "TextFileOp")
    .def_static("get_num_rows", [](const py::list &files) {
      int64_t count = 0;
      std::vector<std::string> filenames;
      for (auto file : files) {
        !file.is_none() ? filenames.push_back(py::str(file)) : (void)filenames.emplace_back("");
      }
      THROW_IF_ERROR(TextFileOp::CountAllFileRows(filenames, &count));
      return count;
    });

  (void)py::class_<ClueOp, DatasetOp, std::shared_ptr<ClueOp>>(*m, "ClueOp")
    .def_static("get_num_rows", [](const py::list &files) {
      int64_t count = 0;
      std::vector<std::string> filenames;
      for (auto file : files) {
        file.is_none() ? (void)filenames.emplace_back("") : filenames.push_back(py::str(file));
      }
      THROW_IF_ERROR(ClueOp::CountAllFileRows(filenames, &count));
      return count;
    });

  (void)py::class_<VOCOp, DatasetOp, std::shared_ptr<VOCOp>>(*m, "VOCOp")
    .def_static("get_num_rows",
                [](const std::string &dir, const std::string &task_type, const std::string &task_mode,
                   const py::dict &dict, int64_t numSamples) {
                  int64_t count = 0;
                  THROW_IF_ERROR(VOCOp::CountTotalRows(dir, task_type, task_mode, dict, &count));
                  return count;
                })
    .def_static("get_class_indexing", [](const std::string &dir, const std::string &task_type,
                                         const std::string &task_mode, const py::dict &dict) {
      std::map<std::string, int32_t> output_class_indexing;
      THROW_IF_ERROR(VOCOp::GetClassIndexing(dir, task_type, task_mode, dict, &output_class_indexing));
      return output_class_indexing;
    });

  (void)py::class_<CocoOp, DatasetOp, std::shared_ptr<CocoOp>>(*m, "CocoOp")
    .def_static("get_class_indexing",
                [](const std::string &dir, const std::string &file, const std::string &task) {
                  std::vector<std::pair<std::string, std::vector<int32_t>>> output_class_indexing;
                  THROW_IF_ERROR(CocoOp::GetClassIndexing(dir, file, task, &output_class_indexing));
                  return output_class_indexing;
                })
    .def_static("get_num_rows", [](const std::string &dir, const std::string &file, const std::string &task) {
      int64_t count = 0;
      THROW_IF_ERROR(CocoOp::CountTotalRows(dir, file, task, &count));
      return count;
    });
}

// Bind Tensor, TensorShape, DataType and the global ConfigManager.
void bindTensor(py::module *m) {
  (void)py::class_<GlobalContext>(*m, "GlobalContext")
    .def_static("config_manager", &GlobalContext::config_manager, py::return_value_policy::reference);

  (void)py::class_<ConfigManager, std::shared_ptr<ConfigManager>>(*m, "ConfigManager")
    .def("__str__", &ConfigManager::ToString)
    .def("set_rows_per_buffer", &ConfigManager::set_rows_per_buffer)
    .def("set_num_parallel_workers", &ConfigManager::set_num_parallel_workers)
    .def("set_worker_connector_size", &ConfigManager::set_worker_connector_size)
    .def("set_op_connector_size", &ConfigManager::set_op_connector_size)
    .def("set_seed", &ConfigManager::set_seed)
    .def("set_monitor_sampling_interval", &ConfigManager::set_monitor_sampling_interval)
    .def("get_rows_per_buffer", &ConfigManager::rows_per_buffer)
    .def("get_num_parallel_workers", &ConfigManager::num_parallel_workers)
    .def("get_worker_connector_size", &ConfigManager::worker_connector_size)
    .def("get_op_connector_size", &ConfigManager::op_connector_size)
    .def("get_seed", &ConfigManager::seed)
    .def("get_monitor_sampling_interval", &ConfigManager::monitor_sampling_interval)
    .def("load", [](ConfigManager &c, std::string s) { THROW_IF_ERROR(c.LoadFile(s)); });

  (void)py::class_<Tensor, std::shared_ptr<Tensor>>(*m, "Tensor", py::buffer_protocol())
    .def(py::init([](py::array arr) {
      std::shared_ptr<Tensor> out;
      THROW_IF_ERROR(Tensor::CreateTensor(&out, arr));
      return out;
    }))
    .def_buffer([](Tensor &tensor) {
      py::buffer_info info;
      THROW_IF_ERROR(Tensor::GetBufferInfo(tensor, &info));
      return info;
    })
    .def("__str__", &Tensor::ToString)
    .def("shape", &Tensor::shape)
    .def("type", &Tensor::type)
    .def("as_array", [](py::object &t) {
      auto &tensor = py::cast<Tensor &>(t);
      // String tensors have no buffer protocol representation; copy out instead.
      if (tensor.type() == DataType::DE_STRING) {
        py::array res;
        tensor.GetDataAsNumpyStrings(&res);
        return res;
      }
      py::buffer_info info;
      THROW_IF_ERROR(Tensor::GetBufferInfo(tensor, &info));
      // Passing `t` as base keeps the Tensor alive while numpy views its memory.
      return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t);
    });

  (void)py::class_<TensorShape>(*m, "TensorShape")
    .def(py::init<py::list>())
    .def("__str__", &TensorShape::ToString)
    .def("as_list", &TensorShape::AsPyList)
    .def("is_known", &TensorShape::known);

  (void)py::class_<DataType>(*m, "DataType")
    .def(py::init<std::string>())
    .def(py::self == py::self)
    .def("__str__", &DataType::ToString)
    // deepcopy returns the same object: DataType is immutable from python's view.
    .def("__deepcopy__", [](py::object &t, py::dict memo) { return t; });
}

// Image ops, part 1: normalize/rescale/crop/resize/flip.
void bindTensorOps1(py::module *m) {
  (void)py::class_<TensorOp, std::shared_ptr<TensorOp>>(*m, "TensorOp")
    .def("__deepcopy__", [](py::object &t, py::dict memo) { return t; });

  (void)py::class_<NormalizeOp, TensorOp, std::shared_ptr<NormalizeOp>>(
    *m, "NormalizeOp", "Tensor operation to normalize an image. Takes mean and std.")
    .def(py::init<float, float, float, float, float, float>(), py::arg("meanR"), py::arg("meanG"), py::arg("meanB"),
         py::arg("stdR"), py::arg("stdG"), py::arg("stdB"));

  (void)py::class_<RescaleOp, TensorOp, std::shared_ptr<RescaleOp>>(
    *m, "RescaleOp", "Tensor operation to rescale an image. Takes scale and shift.")
    .def(py::init<float, float>(), py::arg("rescale"), py::arg("shift"));

  (void)py::class_<CenterCropOp, TensorOp, std::shared_ptr<CenterCropOp>>(
    *m, "CenterCropOp", "Tensor operation to crop and image in the middle. Takes height and width (optional)")
    .def(py::init<int32_t, int32_t>(), py::arg("height"), py::arg("width") = CenterCropOp::kDefWidth);

  (void)py::class_<ResizeOp, TensorOp, std::shared_ptr<ResizeOp>>(
    *m, "ResizeOp", "Tensor operation to resize an image. Takes height, width and mode")
    .def(py::init<int32_t, int32_t, InterpolationMode>(), py::arg("targetHeight"),
         py::arg("targetWidth") = ResizeOp::kDefWidth, py::arg("interpolation") = ResizeOp::kDefInterpolation);

  (void)py::class_<UniformAugOp, TensorOp, std::shared_ptr<UniformAugOp>>(
    *m, "UniformAugOp", "Tensor operation to apply random augmentation(s).")
    .def(py::init<std::vector<std::shared_ptr<TensorOp>>, int32_t>(), py::arg("operations"),
         py::arg("NumOps") = UniformAugOp::kDefNumOps);

  (void)py::class_<BoundingBoxAugmentOp, TensorOp, std::shared_ptr<BoundingBoxAugmentOp>>(
    *m, "BoundingBoxAugmentOp", "Tensor operation to apply a transformation on a random choice of bounding boxes.")
    .def(py::init<std::shared_ptr<TensorOp>, float>(), py::arg("transform"),
         py::arg("ratio") = BoundingBoxAugmentOp::kDefRatio);

  (void)py::class_<ResizeBilinearOp, TensorOp, std::shared_ptr<ResizeBilinearOp>>(
    *m, "ResizeBilinearOp",
    "Tensor operation to resize an image using "
    "Bilinear mode. Takes height and width.")
    .def(py::init<int32_t, int32_t>(), py::arg("targetHeight"),
         py::arg("targetWidth") = ResizeBilinearOp::kDefWidth);

  (void)py::class_<DecodeOp, TensorOp, std::shared_ptr<DecodeOp>>(*m, "DecodeOp",
                                                                  "Tensor operation to decode a jpg image")
    .def(py::init<>())
    .def(py::init<bool>(), py::arg("rgb_format") = DecodeOp::kDefRgbFormat);

  (void)py::class_<RandomHorizontalFlipOp, TensorOp, std::shared_ptr<RandomHorizontalFlipOp>>(
    *m, "RandomHorizontalFlipOp", "Tensor operation to randomly flip an image horizontally.")
    .def(py::init<float>(), py::arg("probability") = RandomHorizontalFlipOp::kDefProbability);

  (void)py::class_<RandomHorizontalFlipWithBBoxOp, TensorOp, std::shared_ptr<RandomHorizontalFlipWithBBoxOp>>(
    *m, "RandomHorizontalFlipWithBBoxOp",
    "Tensor operation to randomly flip an image horizontally, while flipping bounding boxes.")
    .def(py::init<float>(), py::arg("probability") = RandomHorizontalFlipWithBBoxOp::kDefProbability);
}

// Image + data ops, part 2: vertical flips, random crop, one-hot, fill, slice, mask, etc.
void bindTensorOps2(py::module *m) {
  (void)py::class_<RandomVerticalFlipOp, TensorOp, std::shared_ptr<RandomVerticalFlipOp>>(
    *m, "RandomVerticalFlipOp", "Tensor operation to randomly flip an image vertically.")
    .def(py::init<float>(), py::arg("probability") = RandomVerticalFlipOp::kDefProbability);

  (void)py::class_<RandomVerticalFlipWithBBoxOp, TensorOp, std::shared_ptr<RandomVerticalFlipWithBBoxOp>>(
    *m, "RandomVerticalFlipWithBBoxOp",
    "Tensor operation to randomly flip an image vertically"
    " and adjust bounding boxes.")
    .def(py::init<float>(), py::arg("probability") = RandomVerticalFlipWithBBoxOp::kDefProbability);

  (void)py::class_<RandomCropOp, TensorOp, std::shared_ptr<RandomCropOp>>(*m, "RandomCropOp",
                                                                          "Gives random crop of specified size "
                                                                          "Takes crop size")
    .def(py::init<int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, BorderType, bool, uint8_t, uint8_t, uint8_t>(),
         py::arg("cropHeight"), py::arg("cropWidth"), py::arg("padTop") = RandomCropOp::kDefPadTop,
         py::arg("padBottom") = RandomCropOp::kDefPadBottom, py::arg("padLeft") = RandomCropOp::kDefPadLeft,
         py::arg("padRight") = RandomCropOp::kDefPadRight, py::arg("borderType") = RandomCropOp::kDefBorderType,
         py::arg("padIfNeeded") = RandomCropOp::kDefPadIfNeeded, py::arg("fillR") = RandomCropOp::kDefFillR,
         py::arg("fillG") = RandomCropOp::kDefFillG, py::arg("fillB") = RandomCropOp::kDefFillB);

  // Python name "ChannelSwapOp" maps to the HWC->CHW layout transform.
  (void)py::class_<HwcToChwOp, TensorOp, std::shared_ptr<HwcToChwOp>>(*m, "ChannelSwapOp").def(py::init<>());

  (void)py::class_<RandomCropWithBBoxOp, TensorOp, std::shared_ptr<RandomCropWithBBoxOp>>(*m, "RandomCropWithBBoxOp",
                                                                                          "Gives random crop of given "
                                                                                          "size + adjusts bboxes "
                                                                                          "Takes crop size")
    .def(py::init<int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, BorderType, bool, uint8_t, uint8_t, uint8_t>(),
         py::arg("cropHeight"), py::arg("cropWidth"), py::arg("padTop") = RandomCropWithBBoxOp::kDefPadTop,
         py::arg("padBottom") = RandomCropWithBBoxOp::kDefPadBottom,
         py::arg("padLeft") = RandomCropWithBBoxOp::kDefPadLeft,
         py::arg("padRight") = RandomCropWithBBoxOp::kDefPadRight,
         py::arg("borderType") = RandomCropWithBBoxOp::kDefBorderType,
         py::arg("padIfNeeded") = RandomCropWithBBoxOp::kDefPadIfNeeded,
         py::arg("fillR") = RandomCropWithBBoxOp::kDefFillR, py::arg("fillG") = RandomCropWithBBoxOp::kDefFillG,
         py::arg("fillB") = RandomCropWithBBoxOp::kDefFillB);

  (void)py::class_<OneHotOp, TensorOp, std::shared_ptr<OneHotOp>>(
    *m, "OneHotOp", "Tensor operation to apply one hot encoding. Takes number of classes.")
    .def(py::init<int32_t>());

  (void)py::class_<FillOp, TensorOp, std::shared_ptr<FillOp>>(
    *m, "FillOp", "Tensor operation to return tensor filled with same value as input fill value.")
    .def(py::init<std::shared_ptr<Tensor>>());

  (void)py::class_<SliceOp, TensorOp, std::shared_ptr<SliceOp>>(*m, "SliceOp", "Tensor slice operation.")
    .def(py::init<bool>())
    // Construct from a python list of indices.
    .def(py::init([](const py::list &py_list) {
      std::vector<dsize_t> c_list;
      for (auto l : py_list) {
        if (!l.is_none()) {
          c_list.push_back(py::reinterpret_borrow<py::int_>(l));
        }
      }
      return std::make_shared<SliceOp>(c_list);
    }))
    // Construct from a (start, stop, step) tuple, any of which may be None.
    .def(py::init([](const py::tuple &py_slice) {
      if (py_slice.size() != 3) {
        THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Wrong slice object"));
      }
      Slice c_slice;
      if (!py_slice[0].is_none() && !py_slice[1].is_none() && !py_slice[2].is_none()) {
        c_slice = Slice(py::reinterpret_borrow<py::int_>(py_slice[0]), py::reinterpret_borrow<py::int_>(py_slice[1]),
                        py::reinterpret_borrow<py::int_>(py_slice[2]));
      } else if (py_slice[0].is_none() && py_slice[2].is_none()) {
        c_slice = Slice(py::reinterpret_borrow<py::int_>(py_slice[1]));
      } else if (!py_slice[0].is_none() && !py_slice[1].is_none()) {
        c_slice = Slice(py::reinterpret_borrow<py::int_>(py_slice[0]), py::reinterpret_borrow<py::int_>(py_slice[1]));
      }
      if (!c_slice.valid()) {
        THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Wrong slice object"));
      }
      return std::make_shared<SliceOp>(c_slice);
    }));

  (void)py::enum_<RelationalOp>(*m, "RelationalOp", py::arithmetic())
    .value("EQ", RelationalOp::kEqual)
    .value("NE", RelationalOp::kNotEqual)
    .value("LT", RelationalOp::kLess)
    .value("LE", RelationalOp::kLessEqual)
    .value("GT", RelationalOp::kGreater)
    .value("GE", RelationalOp::kGreaterEqual)
    .export_values();

  (void)py::class_<MaskOp, TensorOp, std::shared_ptr<MaskOp>>(*m, "MaskOp",
                                                              "Tensor mask operation using relational comparator")
    .def(py::init<RelationalOp, std::shared_ptr<Tensor>, DataType>());

  (void)py::class_<DuplicateOp, TensorOp, std::shared_ptr<DuplicateOp>>(*m, "DuplicateOp", "Duplicate tensor.")
    .def(py::init<>());

  (void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>(
    *m, "TruncateSequencePairOp", "Tensor operation to truncate two tensors to a max_length")
    // NOTE(review): ctor argument type reconstructed (max_length) — confirm against the op header.
    .def(py::init<int64_t>());

  (void)py::class_<ConcatenateOp, TensorOp, std::shared_ptr<ConcatenateOp>>(*m, "ConcatenateOp",
                                                                            "Tensor operation concatenate tensors.")
    .def(py::init<int8_t, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>(), py::arg("axis"),
         py::arg("prepend").none(true), py::arg("append").none(true));

  (void)py::class_<RandomRotationOp, TensorOp, std::shared_ptr<RandomRotationOp>>(
    *m, "RandomRotationOp",
    "Tensor operation to apply RandomRotation."
    "Takes a range for degrees and "
    "optional parameters for rotation center and image expand")
    .def(py::init<float, float, float, float, InterpolationMode, bool, uint8_t, uint8_t, uint8_t>(),
         py::arg("startDegree"), py::arg("endDegree"), py::arg("centerX") = RandomRotationOp::kDefCenterX,
         py::arg("centerY") = RandomRotationOp::kDefCenterY,
         py::arg("interpolation") = RandomRotationOp::kDefInterpolation,
         py::arg("expand") = RandomRotationOp::kDefExpand, py::arg("fillR") = RandomRotationOp::kDefFillR,
         py::arg("fillG") = RandomRotationOp::kDefFillG, py::arg("fillB") = RandomRotationOp::kDefFillB);

  (void)py::class_<PadEndOp, TensorOp, std::shared_ptr<PadEndOp>>(
    *m, "PadEndOp", "Tensor operation to pad end of tensor with a pad value.")
    .def(py::init<TensorShape, std::shared_ptr<Tensor>>());
}

// Image ops, part 3: random crop-and-resize, color adjust, random resize, cutout.
void bindTensorOps3(py::module *m) {
  (void)py::class_<RandomCropAndResizeOp, TensorOp, std::shared_ptr<RandomCropAndResizeOp>>(
    *m, "RandomCropAndResizeOp",
    "Tensor operation to randomly crop an image and resize to a given size."
    "Takes output height and width and"
    "optional parameters for lower and upper bound for aspect ratio (h/w) and scale,"
    "interpolation mode, and max attempts to crop")
    .def(py::init<int32_t, int32_t, float, float, float, float, InterpolationMode, int32_t>(),
         py::arg("targetHeight"), py::arg("targetWidth"), py::arg("scaleLb") = RandomCropAndResizeOp::kDefScaleLb,
         py::arg("scaleUb") = RandomCropAndResizeOp::kDefScaleUb,
         py::arg("aspectLb") = RandomCropAndResizeOp::kDefAspectLb,
         py::arg("aspectUb") = RandomCropAndResizeOp::kDefAspectUb,
         py::arg("interpolation") = RandomCropAndResizeOp::kDefInterpolation,
         py::arg("maxIter") = RandomCropAndResizeOp::kDefMaxIter);

  (void)py::class_<RandomCropAndResizeWithBBoxOp, TensorOp, std::shared_ptr<RandomCropAndResizeWithBBoxOp>>(
    *m, "RandomCropAndResizeWithBBoxOp",
    "Tensor operation to randomly crop an image (with BBoxes) and resize to a given size."
    "Takes output height and width and"
    "optional parameters for lower and upper bound for aspect ratio (h/w) and scale,"
    "interpolation mode, and max attempts to crop")
    .def(py::init<int32_t, int32_t, float, float, float, float, InterpolationMode, int32_t>(),
         py::arg("targetHeight"), py::arg("targetWidth"),
         py::arg("scaleLb") = RandomCropAndResizeWithBBoxOp::kDefScaleLb,
         py::arg("scaleUb") = RandomCropAndResizeWithBBoxOp::kDefScaleUb,
         py::arg("aspectLb") = RandomCropAndResizeWithBBoxOp::kDefAspectLb,
         py::arg("aspectUb") = RandomCropAndResizeWithBBoxOp::kDefAspectUb,
         py::arg("interpolation") = RandomCropAndResizeWithBBoxOp::kDefInterpolation,
         py::arg("maxIter") = RandomCropAndResizeWithBBoxOp::kDefMaxIter);

  (void)py::class_<RandomColorAdjustOp, TensorOp, std::shared_ptr<RandomColorAdjustOp>>(
    *m, "RandomColorAdjustOp",
    "Tensor operation to adjust an image's color randomly."
    "Takes range for brightness, contrast, saturation, hue and")
    .def(py::init<float, float, float, float, float, float, float, float>(), py::arg("bright_factor_start"),
         py::arg("bright_factor_end"), py::arg("contrast_factor_start"), py::arg("contrast_factor_end"),
         py::arg("saturation_factor_start"), py::arg("saturation_factor_end"), py::arg("hue_factor_start"),
         py::arg("hue_factor_end"));

  (void)py::class_<RandomResizeOp, TensorOp, std::shared_ptr<RandomResizeOp>>(
    *m, "RandomResizeOp",
    "Tensor operation to resize an image using a randomly selected interpolation. Takes height and width.")
    .def(py::init<int32_t, int32_t>(), py::arg("targetHeight"),
         py::arg("targetWidth") = RandomResizeOp::kDefTargetWidth);

  (void)py::class_<CutOutOp, TensorOp, std::shared_ptr<CutOutOp>>(
    *m, "CutOutOp", "Tensor operation to randomly erase a portion of the image. Takes height and width.")
    .def(py::init<int32_t, int32_t, int32_t, bool, uint8_t, uint8_t, uint8_t>(), py::arg("boxHeight"),
         py::arg("boxWidth"), py::arg("numPatches"), py::arg("randomColor") = CutOutOp::kDefRandomColor,
         py::arg("fillR") = CutOutOp::kDefFillR, py::arg("fillG") = CutOutOp::kDefFillG,
         py::arg("fillB") = CutOutOp::kDefFillB);
}

// Data ops, part 4: type cast, no-op, float16 cast, crop-decode-resize, pad, to-number.
void bindTensorOps4(py::module *m) {
  (void)py::class_<TypeCastOp, TensorOp, std::shared_ptr<TypeCastOp>>(
    *m, "TypeCastOp", "Tensor operator to type cast data to a specified type.")
    .def(py::init<DataType>(), py::arg("data_type"))
    .def(py::init<std::string>(), py::arg("data_type"));

  (void)py::class_<NoOp, TensorOp, std::shared_ptr<NoOp>>(*m, "NoOp",
                                                          "TensorOp that does nothing, for testing purposes only.")
    .def(py::init<>());

  (void)py::class_<ToFloat16Op, TensorOp, std::shared_ptr<ToFloat16Op>>(
    *m, "ToFloat16Op", py::dynamic_attr(), "Tensor operator to type cast float32 data to a float16 type.")
    .def(py::init<>());

  (void)py::class_<RandomCropDecodeResizeOp, TensorOp, std::shared_ptr<RandomCropDecodeResizeOp>>(
    *m, "RandomCropDecodeResizeOp", "equivalent to RandomCropAndResize but crops before decoding")
    .def(py::init<int32_t, int32_t, float, float, float, float, InterpolationMode, int32_t>(),
         py::arg("targetHeight"), py::arg("targetWidth"),
         py::arg("scaleLb") = RandomCropDecodeResizeOp::kDefScaleLb,
         py::arg("scaleUb") = RandomCropDecodeResizeOp::kDefScaleUb,
         py::arg("aspectLb") = RandomCropDecodeResizeOp::kDefAspectLb,
         py::arg("aspectUb") = RandomCropDecodeResizeOp::kDefAspectUb,
         py::arg("interpolation") = RandomCropDecodeResizeOp::kDefInterpolation,
         py::arg("maxIter") = RandomCropDecodeResizeOp::kDefMaxIter);

  (void)py::class_<PadOp, TensorOp, std::shared_ptr<PadOp>>(
    *m, "PadOp",
    "Pads image with specified color, default black, "
    "Takes amount to pad for top, bottom, left, right of image, boarder type and color")
    .def(py::init<int32_t, int32_t, int32_t, int32_t, BorderType, uint8_t, uint8_t, uint8_t>(), py::arg("padTop"),
         py::arg("padBottom"), py::arg("padLeft"), py::arg("padRight"), py::arg("borderTypes") = PadOp::kDefBorderType,
         py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG,
         py::arg("fillB") = PadOp::kDefFillB);

  (void)py::class_<ToNumberOp, TensorOp, std::shared_ptr<ToNumberOp>>(*m, "ToNumberOp",
                                                                      "TensorOp to convert strings to numbers.")
    .def(py::init<DataType>(), py::arg("data_type"))
    .def(py::init<std::string>(), py::arg("data_type"));
}

// Text tokenizer ops that do not depend on ICU.
void bindTokenizerOps(py::module *m) {
  (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
    .def(py::init<const std::string &, const std::string &, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
         py::arg("mode") = JiebaMode::kMix)
    .def("add_word",
         [](JiebaTokenizerOp &self, const std::string word, int freq) { THROW_IF_ERROR(self.AddWord(word, freq)); });

  (void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
    *m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
    .def(py::init<>());

  (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp",
                                                                  "Tensor operation to LookUp each word")
    .def(py::init<std::shared_ptr<Vocab>, WordIdType>(), py::arg("vocab"), py::arg("unknown"))
    .def(py::init<std::shared_ptr<Vocab>>(), py::arg("vocab"));

  (void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp", "TensorOp performs ngram mapping")
    .def(py::init<const std::vector<int32_t> &, int32_t, int32_t, const std::string &, const std::string &,
                  const std::string &>(),
         py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"),
         py::arg("r_pad_token"), py::arg("separator"));

  (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
    *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
}

// Tokenizer ops that require ICU (compiled only when ENABLE_ICU4C is set).
void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
  (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
    .def(py::init<>());

  (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
    *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
    .def(py::init<>())
    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);

  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
    *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
    .def(py::init<>());

  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
    *m, "NormalizeUTF8Op", "Apply normalize operation on utf-8 string tensor.")
    .def(py::init<>())
    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);

  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
    *m, "RegexReplaceOp", "Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.")
    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"), py::arg("replace"),
         py::arg("replace_all"));

  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"),
         py::arg("keep_delim_pattern"));

  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);

  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(
    *m, "BertTokenizerOp", "Tokenizer used for Bert text process.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
                  NormalizeForm, bool>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
#endif
}

// Bind the dataset samplers and their mindrecord counterparts.
// NOTE(review): constructor argument lists below were stripped from the mangled
// source and reconstructed from the sampler headers' conventional signatures —
// confirm each py::init<...> against the corresponding header.
void bindSamplerOps(py::module *m) {
  (void)py::class_<Sampler, std::shared_ptr<Sampler>>(*m, "Sampler")
    .def("set_num_rows", [](Sampler &self, int64_t rows) { THROW_IF_ERROR(self.SetNumRowsInDataset(rows)); })
    .def("set_num_samples", [](Sampler &self, int64_t samples) { THROW_IF_ERROR(self.SetNumSamples(samples)); })
    .def("initialize", [](Sampler &self) { THROW_IF_ERROR(self.InitSampler()); })
    .def("get_indices",
         [](Sampler &self) {
           py::array ret;
           THROW_IF_ERROR(self.GetAllIdsThenReset(&ret));
           return ret;
         })
    .def("add_child", [](std::shared_ptr<Sampler> self, std::shared_ptr<Sampler> child) {
      THROW_IF_ERROR(self->AddChild(child));
    });

  (void)py::class_<mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardOperator>>(*m, "ShardOperator")
    .def("add_child", [](std::shared_ptr<mindrecord::ShardOperator> self,
                         std::shared_ptr<mindrecord::ShardOperator> child) { self->SetChildOp(child); });

  (void)py::class_<DistributedSampler, Sampler, std::shared_ptr<DistributedSampler>>(*m, "DistributedSampler")
    .def(py::init<int64_t, int64_t, bool, uint32_t>());

  (void)py::class_<PKSampler, Sampler, std::shared_ptr<PKSampler>>(*m, "PKSampler")
    .def(py::init<int64_t, bool>());

  (void)py::class_<RandomSampler, Sampler, std::shared_ptr<RandomSampler>>(*m, "RandomSampler")
    .def(py::init<bool, bool, int64_t>());

  (void)py::class_<SequentialSampler, Sampler, std::shared_ptr<SequentialSampler>>(*m, "SequentialSampler")
    .def(py::init<>());

  (void)py::class_<SubsetRandomSampler, Sampler, std::shared_ptr<SubsetRandomSampler>>(*m, "SubsetRandomSampler")
    .def(py::init<std::vector<int64_t>>());

  (void)py::class_<mindrecord::ShardSample, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardSample>>(
    *m, "MindrecordSubsetRandomSampler")
    .def(py::init<std::vector<int64_t>, uint32_t>(), py::arg("indices"), py::arg("seed") = GetSeed());

  (void)py::class_<mindrecord::ShardPkSample, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardPkSample>>(
    *m, "MindrecordPkSampler")
    .def(py::init([](int64_t kVal, std::string kColumn, bool shuffle) {
      if (shuffle == true) {
        // Shuffle within each class: no cap on samples per class, seeded globally.
        return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal, std::numeric_limits<int64_t>::max(),
                                                           GetSeed());
      } else {
        return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal);
      }
    }));

  (void)py::class_<mindrecord::ShardDistributedSample, mindrecord::ShardOperator,
                   std::shared_ptr<mindrecord::ShardDistributedSample>>(*m, "MindrecordDistributedSampler")
    .def(py::init<int64_t, int64_t, int64_t, bool, uint32_t>());

  (void)py::class_<mindrecord::ShardShuffle, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardShuffle>>(
    *m, "MindrecordRandomSampler")
    .def(py::init([](int64_t num_samples, bool replacement, bool reshuffle_each_epoch) {
      return std::make_shared<mindrecord::ShardShuffle>(GetSeed(), num_samples, replacement, reshuffle_each_epoch);
    }));

  (void)py::class_<mindrecord::ShardSequentialSample, mindrecord::ShardOperator,
                   std::shared_ptr<mindrecord::ShardSequentialSample>>(*m, "MindrecordSequentialSampler")
    .def(py::init([](int num_samples, int start_index) {
      return std::make_shared<mindrecord::ShardSequentialSample>(num_samples, start_index);
    }));

  (void)py::class_<WeightedRandomSampler, Sampler, std::shared_ptr<WeightedRandomSampler>>(*m,
                                                                                           "WeightedRandomSampler")
    .def(py::init<std::vector<double>, bool>());

  (void)py::class_<PythonSampler, Sampler, std::shared_ptr<PythonSampler>>(*m, "PythonSampler")
    .def(py::init<py::object>());
}

// Bind the per-batch info object handed to python batch callbacks.
void bindInfoObjects(py::module *m) {
  (void)py::class_<BatchOp::CBatchInfo>(*m, "CBatchInfo")
    .def(py::init<int64_t, int64_t, int64_t>())
    .def("get_epoch_num", &BatchOp::CBatchInfo::get_epoch_num)
    .def("get_batch_num", &BatchOp::CBatchInfo::get_batch_num);
}

// Bind Vocab builders (from list / file / dict).
void bindVocabObjects(py::module *m) {
  (void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
    .def(py::init<>())
    .def_static("from_list",
                [](const py::list &words, const py::list &special_tokens, bool special_first) {
                  std::shared_ptr<Vocab> v;
                  THROW_IF_ERROR(Vocab::BuildFromPyList(words, special_tokens, special_first, &v));
                  return v;
                })
    .def_static("from_file",
                [](const std::string &path, const std::string &dlm, int32_t vocab_size,
                   const py::list &special_tokens, bool special_first) {
                  std::shared_ptr<Vocab> v;
                  THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, special_tokens, special_first, &v));
                  return v;
                })
    .def_static("from_dict", [](const py::dict &words) {
      std::shared_ptr<Vocab> v;
      THROW_IF_ERROR(Vocab::BuildFromPyDict(words, &v));
      return v;
    });
}

// Bind the GNN graph object and its query APIs.
void bindGraphData(py::module *m) {
  (void)py::class_<gnn::Graph, std::shared_ptr<gnn::Graph>>(*m, "Graph")
    .def(py::init([](std::string dataset_file, int32_t num_workers) {
      std::shared_ptr<gnn::Graph> g_out = std::make_shared<gnn::Graph>(dataset_file, num_workers);
      THROW_IF_ERROR(g_out->Init());
      return g_out;
    }))
    .def("get_all_nodes",
         [](gnn::Graph &g, gnn::NodeType node_type) {
           std::shared_ptr<Tensor> out;
           THROW_IF_ERROR(g.GetAllNodes(node_type, &out));
           return out;
         })
    .def("get_all_edges",
         [](gnn::Graph &g, gnn::EdgeType edge_type) {
           std::shared_ptr<Tensor> out;
           THROW_IF_ERROR(g.GetAllEdges(edge_type, &out));
           return out;
         })
    .def("get_nodes_from_edges",
         [](gnn::Graph &g, std::vector<gnn::EdgeIdType> edge_list) {
           std::shared_ptr<Tensor> out;
           THROW_IF_ERROR(g.GetNodesFromEdges(edge_list, &out));
           return out;
         })
    .def("get_all_neighbors",
         [](gnn::Graph &g, std::vector<gnn::NodeIdType> node_list, gnn::NodeType neighbor_type) {
           std::shared_ptr<Tensor> out;
           THROW_IF_ERROR(g.GetAllNeighbors(node_list, neighbor_type, &out));
           return out;
         })
    .def("get_sampled_neighbors",
         [](gnn::Graph &g, std::vector<gnn::NodeIdType> node_list, std::vector<gnn::NodeIdType> neighbor_nums,
            std::vector<gnn::NodeType> neighbor_types) {
           std::shared_ptr<Tensor> out;
           THROW_IF_ERROR(g.GetSampledNeighbors(node_list, neighbor_nums, neighbor_types, &out));
           return out;
         })
    .def("get_neg_sampled_neighbors",
         [](gnn::Graph &g, std::vector<gnn::NodeIdType> node_list, gnn::NodeIdType neighbor_num,
            gnn::NodeType neg_neighbor_type) {
           std::shared_ptr<Tensor> out;
           THROW_IF_ERROR(g.GetNegSampledNeighbors(node_list, neighbor_num, neg_neighbor_type, &out));
           return out;
         })
    .def("get_node_feature",
         [](gnn::Graph &g, std::shared_ptr<Tensor> node_list, std::vector<gnn::FeatureType> feature_types) {
           TensorRow out;
           THROW_IF_ERROR(g.GetNodeFeature(node_list, feature_types, &out));
           return out.getRow();
         })
    .def("graph_info",
         [](gnn::Graph &g) {
           py::dict out;
           THROW_IF_ERROR(g.GraphInfo(&out));
           return out;
         })
    .def("random_walk", [](gnn::Graph &g, std::vector<gnn::NodeIdType> node_list,
                           std::vector<gnn::NodeType> meta_path, float step_home_param, float step_away_param,
                           gnn::NodeIdType default_node) {
      std::shared_ptr<Tensor> out;
      THROW_IF_ERROR(g.RandomWalk(node_list, meta_path, step_home_param, step_away_param, default_node, &out));
      return out;
    });
}

// This is where we externalize the C logic as python modules
PYBIND11_MODULE(_c_dataengine, m) {
  m.doc() = "pybind11 for _c_dataengine";
  (void)py::class_<DatasetOp, std::shared_ptr<DatasetOp>>(m, "DatasetOp");

  (void)py::enum_<OpName>(m, "OpName", py::arithmetic())
    .value("SHUFFLE", OpName::kShuffle)
    .value("BATCH", OpName::kBatch)
    .value("BUCKETBATCH", OpName::kBucketBatch)
    .value("BARRIER", OpName::kBarrier)
    .value("MINDRECORD", OpName::kMindrecord)
    .value("CACHE", OpName::kCache)
    .value("REPEAT", OpName::kRepeat)
    .value("SKIP", OpName::kSkip)
    .value("TAKE", OpName::kTake)
    .value("ZIP", OpName::kZip)
    .value("CONCAT", OpName::kConcat)
    .value("MAP", OpName::kMap)
    .value("FILTER", OpName::kFilter)
    .value("DEVICEQUEUE", OpName::kDeviceQueue)
    .value("GENERATOR", OpName::kGenerator)
    // NOTE(review): export_values() appears mid-chain in the original; kept as-is
    // since only the values before it are exported into the module namespace.
    .export_values()
    .value("RENAME", OpName::kRename)
    .value("TFREADER", OpName::kTfReader)
    .value("PROJECT", OpName::kProject)
    .value("IMAGEFOLDER", OpName::kImageFolder)
    .value("MNIST", OpName::kMnist)
    .value("MANIFEST", OpName::kManifest)
    .value("VOC", OpName::kVoc)
    .value("COCO", OpName::kCoco)
    .value("CIFAR10", OpName::kCifar10)
    .value("CIFAR100", OpName::kCifar100)
    .value("RANDOMDATA", OpName::kRandomData)
    .value("BUILDVOCAB", OpName::kBuildVocab)
    .value("CELEBA", OpName::kCelebA)
    .value("TEXTFILE", OpName::kTextFile)
    .value("CLUE", OpName::kClue);

  (void)py::enum_<JiebaMode>(m, "JiebaMode", py::arithmetic())
    .value("DE_JIEBA_MIX", JiebaMode::kMix)
    .value("DE_JIEBA_MP", JiebaMode::kMp)
    .value("DE_JIEBA_HMM", JiebaMode::kHmm)
    .export_values();

#ifdef ENABLE_ICU4C
  (void)py::enum_<NormalizeForm>(m, "NormalizeForm", py::arithmetic())
    .value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
    .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
    .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
    .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
    .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
    .export_values();
#endif

  (void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())
    .value("DE_INTER_LINEAR", InterpolationMode::kLinear)
    .value("DE_INTER_CUBIC", InterpolationMode::kCubic)
    .value("DE_INTER_AREA", InterpolationMode::kArea)
    .value("DE_INTER_NEAREST_NEIGHBOUR", InterpolationMode::kNearestNeighbour)
    .export_values();

  (void)py::enum_<BorderType>(m, "BorderType", py::arithmetic())
    .value("DE_BORDER_CONSTANT", BorderType::kConstant)
    .value("DE_BORDER_EDGE", BorderType::kEdge)
    .value("DE_BORDER_REFLECT", BorderType::kReflect)
    .value("DE_BORDER_SYMMETRIC", BorderType::kSymmetric)
    .export_values();

  bindDEPipeline(&m);
  bindTensor(&m);
  bindTensorOps1(&m);
  bindTensorOps2(&m);
  bindTensorOps3(&m);
  bindTensorOps4(&m);
  bindTokenizerOps(&m);
  bindSamplerOps(&m);
  bindDatasetOps(&m);
  bindInfoObjects(&m);
  bindVocabObjects(&m);
  bindGraphData(&m);
  bindDependIcuTokenizerOps(&m);
}
}  // namespace dataset
}  // namespace mindspore