Browse Source

dataset: Fill op: C++ API support, UTs and Pybind decoupling

pull/14911/head
Cathy Wong 5 years ago
parent
commit
7e6a03487e
12 changed files with 725 additions and 128 deletions
  1. +0
    -1
      mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt
  2. +0
    -39
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/data/bindings.cc
  3. +11
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/bindings.cc
  4. +14
    -0
      mindspore/ccsrc/minddata/dataset/api/transforms.cc
  5. +24
    -0
      mindspore/ccsrc/minddata/dataset/include/transforms.h
  6. +24
    -0
      mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.cc
  7. +19
    -0
      mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.h
  8. +5
    -2
      mindspore/dataset/transforms/c_transforms.py
  9. +76
    -0
      tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc
  10. +429
    -1
      tests/ut/cpp/dataset/c_api_transforms_test.cc
  11. +14
    -0
      tests/ut/python/dataset/test_fill_op.py
  12. +109
    -85
      tests/ut/python/dataset/test_serdes_dataset.py

+ 0
- 1
mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt View File

@@ -15,7 +15,6 @@ if(ENABLE_PYTHON)
python/bindings/dataset/engine/ir/execute/bindings.cc
python/bindings/dataset/engine/ir/schema/bindings.cc
python/bindings/dataset/kernels/bindings.cc
python/bindings/dataset/kernels/data/bindings.cc
python/bindings/dataset/kernels/ir/bindings.cc
python/bindings/dataset/kernels/ir/image/bindings.cc
python/bindings/dataset/text/bindings.cc


+ 0
- 39
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/data/bindings.cc View File

@@ -1,39 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"

#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/kernels/data/fill_op.h"
#include "minddata/dataset/kernels/data/to_float16_op.h"

namespace mindspore {
namespace dataset {

PYBIND_REGISTER(
FillOp, 1, ([](const py::module *m) {
(void)py::class_<FillOp, TensorOp, std::shared_ptr<FillOp>>(*m, "FillOp").def(py::init<std::shared_ptr<Tensor>>());
}));

PYBIND_REGISTER(ToFloat16Op, 1, ([](const py::module *m) {
(void)py::class_<ToFloat16Op, TensorOp, std::shared_ptr<ToFloat16Op>>(*m, "ToFloat16Op",
py::dynamic_attr())
.def(py::init<>());
}));

} // namespace dataset
} // namespace mindspore

+ 11
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/bindings.cc View File

@@ -86,6 +86,17 @@ PYBIND_REGISTER(
}));
}));

// Python binding for transforms::FillOperation. ValidateParams() is invoked
// eagerly in the factory so an invalid fill_value (e.g. non-scalar) surfaces
// as a Python exception at construction time rather than at pipeline build.
PYBIND_REGISTER(FillOperation, 1, ([](const py::module *m) {
                  (void)
                    py::class_<transforms::FillOperation, TensorOperation, std::shared_ptr<transforms::FillOperation>>(
                      *m, "FillOperation")
                      .def(py::init([](std::shared_ptr<Tensor> fill_value) {
                        auto fill = std::make_shared<transforms::FillOperation>(fill_value);
                        THROW_IF_ERROR(fill->ValidateParams());
                        return fill;
                      }));
                }));

PYBIND_REGISTER(MaskOperation, 1, ([](const py::module *m) {
(void)
py::class_<transforms::MaskOperation, TensorOperation, std::shared_ptr<transforms::MaskOperation>>(


+ 14
- 0
mindspore/ccsrc/minddata/dataset/api/transforms.cc View File

@@ -85,6 +85,20 @@ Duplicate::Duplicate() {}
std::shared_ptr<TensorOperation> Duplicate::Parse() { return std::make_shared<DuplicateOperation>(); }

#ifndef ENABLE_ANDROID
// Constructor to Fill
struct Fill::Data {
explicit Data(MSTensor fill_value) : fill_value_(fill_value) {}
MSTensor fill_value_;
};

Fill::Fill(MSTensor fill_value) : data_(std::make_shared<Data>(fill_value)) {}

std::shared_ptr<TensorOperation> Fill::Parse() {
std::shared_ptr<Tensor> out_fill_value;
Tensor::CreateFromMSTensor(data_->fill_value_, &out_fill_value);
return std::make_shared<FillOperation>(out_fill_value);
}

// Constructor to Mask
struct Mask::Data {
explicit Data(RelationalOp op, MSTensor constant, mindspore::DataType ms_type)


+ 24
- 0
mindspore/ccsrc/minddata/dataset/include/transforms.h View File

@@ -194,6 +194,30 @@ class Duplicate final : public TensorTransform {
std::shared_ptr<TensorOperation> Parse() override;
};

/// \brief Fill Op.
/// \notes Tensor operation to fill all elements in the tensor with the specified value.
///     The output tensor will have the same shape and type as the input tensor.
class Fill final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] fill_value Scalar value to fill the tensor with.
  ///     Can only be MSTensor of the following types from mindspore::DataType:
  ///     String, Bool, Int8/16/32/64, UInt8/16/32/64, Float16/32/64.
  explicit Fill(MSTensor fill_value);

  /// \brief Destructor
  ~Fill() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // PImpl-style holder for the fill value (Data is defined in transforms.cc).
  struct Data;
  std::shared_ptr<Data> data_;
};

/// \brief Mask Op.
/// \notes Mask content of the input tensor with the given predicate.
/// Any element of the tensor that matches the predicate will be evaluated to True, otherwise False.


+ 24
- 0
mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.cc View File

@@ -26,6 +26,7 @@
#endif
#include "minddata/dataset/kernels/data/duplicate_op.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/kernels/data/fill_op.h"
#include "minddata/dataset/kernels/data/mask_op.h"
#endif
#include "minddata/dataset/kernels/data/one_hot_op.h"
@@ -111,6 +112,29 @@ Status DuplicateOperation::ValidateParams() { return Status::OK(); }
std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); }

#ifndef ENABLE_ANDROID

// FillOperation
FillOperation::FillOperation(std::shared_ptr<Tensor> fill_value) : fill_value_(std::move(fill_value)) {}

/// \brief Validate that fill_value_ is a non-null scalar tensor.
Status FillOperation::ValidateParams() {
  // Guard against a null tensor (e.g. a failed upstream conversion) before
  // dereferencing it for the scalar-shape check.
  if (fill_value_ == nullptr) {
    std::string err_msg = "Fill: fill_value is null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (fill_value_->shape() != TensorShape::CreateScalar()) {
    std::string err_msg = "Fill: fill_value is not a scalar tensor.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> FillOperation::Build() { return std::make_shared<FillOp>(fill_value_); }

/// \brief Serialize the fill value for pipeline serdes.
Status FillOperation::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["fill_value"] = fill_value_->ToString();
  *out_json = args;
  return Status::OK();
}

// MaskOperation
MaskOperation::MaskOperation(RelationalOp op, const std::shared_ptr<Tensor> &constant, DataType dtype)
: op_(op), constant_(constant), dtype_(dtype) {}


+ 19
- 0
mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.h View File

@@ -31,6 +31,7 @@ namespace dataset {
constexpr char kComposeOperation[] = "Compose";
constexpr char kConcatenateOperation[] = "Concatenate";
constexpr char kDuplicateOperation[] = "Duplicate";
constexpr char kFillOperation[] = "Fill";
constexpr char kMaskOperation[] = "Mask";
constexpr char kOneHotOperation[] = "OneHot";
constexpr char kPadEndOperation[] = "PadEnd";
@@ -93,6 +94,24 @@ class DuplicateOperation : public TensorOperation {
std::string Name() const override { return kDuplicateOperation; }
};

// IR node for the Fill transform: stores the scalar fill value and builds
// the runtime FillOp.
class FillOperation : public TensorOperation {
 public:
  /// \param[in] fill_value Scalar tensor holding the value to fill with
  ///     (checked by ValidateParams).
  explicit FillOperation(std::shared_ptr<Tensor> fill_value);

  ~FillOperation() = default;

  /// \brief Build the runtime FillOp from this IR node.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate that fill_value is a scalar tensor.
  Status ValidateParams() override;

  std::string Name() const override { return kFillOperation; }

  /// \brief Serialize the fill value into JSON for pipeline serdes.
  Status to_json(nlohmann::json *out_json) override;

 private:
  std::shared_ptr<Tensor> fill_value_;
};

class MaskOperation : public TensorOperation {
public:
explicit MaskOperation(RelationalOp op, const std::shared_ptr<Tensor> &constant, DataType dtype);


+ 5
- 2
mindspore/dataset/transforms/c_transforms.py View File

@@ -77,7 +77,7 @@ class OneHot(TensorOperation):
return cde.OneHotOperation(self.num_classes)


class Fill(cde.FillOp):
class Fill(TensorOperation):
"""
Tensor operation to fill all elements in the tensor with the specified value.
The output tensor will have the same shape and type as the input tensor.
@@ -101,7 +101,10 @@ class Fill(cde.FillOp):

@check_fill_value
def __init__(self, fill_value):
super().__init__(cde.Tensor(np.array(fill_value)))
self.fill_value = cde.Tensor(np.array(fill_value))

def parse(self):
return cde.FillOperation(self.fill_value)


class TypeCast(TensorOperation):


+ 76
- 0
tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc View File

@@ -475,6 +475,82 @@ TEST_F(MindDataTestPipeline, TestRandomDatasetBasic7) {
GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestRandomDatasetUInt8) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetUInt8.";

  // Create a RandomDataset with UInt8 numbers for given shape.
  // Fixed: u_int32_t is a non-standard alias; use standard uint32_t.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();
  GlobalContext::config_manager()->set_seed(963);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeUInt8, {4});
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(3);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and count rows; values are random so only log them.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestRandomDatasetFloat) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetFloat.";

  // Create a RandomDataset with Float16 numbers for given 2D shape.
  // Fixed: u_int32_t is a non-standard alias; use standard uint32_t.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();
  GlobalContext::config_manager()->set_seed(369);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeFloat16, {2, 3});
  std::shared_ptr<Dataset> ds = RandomData(4, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(2);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and count rows; values are random so only log them.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestRandomDatasetDuplicateColumnName) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetDuplicateColumnName.";



+ 429
- 1
tests/ut/cpp/dataset/c_api_transforms_test.cc View File

@@ -241,6 +241,434 @@ TEST_F(MindDataTestPipeline, TestDuplicateSuccess) {
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestFillSuccessInt) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillSuccessInt.";

  // Create a RandomDataset with Int32 numbers for given shape.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();  // fixed non-standard u_int32_t
  GlobalContext::config_manager()->set_seed(864);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {6});
  std::shared_ptr<Dataset> ds = RandomData(5, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(3);
  EXPECT_NE(ds, nullptr);

  // Create Fill op - to fill every element with the scalar 3.
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateScalar(3, &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"col1"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<int32_t>> expected = {
    {3, 3, 3, 3, 3, 3}, {3, 3, 3, 3, 3, 3}, {3, 3, 3, 3, 3, 3}, {3, 3, 3, 3, 3, 3}, {3, 3, 3, 3, 3, 3}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 5);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestFillSuccessBool) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillSuccessBool.";

  // Create a RandomDataset with bool values for given shape.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();  // fixed non-standard u_int32_t
  GlobalContext::config_manager()->set_seed(963);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeBool, {4});
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(2);
  EXPECT_NE(ds, nullptr);

  // Create Fill op - to fill with true (original comment wrongly said "zero").
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateScalar((bool)true, &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"col1"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<bool>> expected = {
    {true, true, true, true}, {true, true, true, true}, {true, true, true, true}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestFillSuccessDownTypecast) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillSuccessDownTypecast.";

  // Create a RandomDataset with UInt8 numbers for given shape.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();  // fixed non-standard u_int32_t
  GlobalContext::config_manager()->set_seed(963);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeUInt8, {4});
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(2);
  EXPECT_NE(ds, nullptr);

  // Create Fill op - to fill with -3; downcast to uint8 wraps to 253.
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateScalar(-3, &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"col1"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  // Note: 2**8 - 3 = 256 - 3 = 253
  std::vector<std::vector<uint8_t>> expected = {{253, 253, 253, 253}, {253, 253, 253, 253}, {253, 253, 253, 253}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestFillSuccessDownTypecastZero) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillSuccessDownTypecastZero.";

  // Create a RandomDataset with UInt8 numbers for given shape.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();  // fixed non-standard u_int32_t
  GlobalContext::config_manager()->set_seed(963);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeUInt8, {4});
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(2);
  EXPECT_NE(ds, nullptr);

  // Create Fill op - to fill with zero; 0 fits uint8 exactly, no wrap-around.
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateScalar(0, &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"col1"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<uint8_t>> expected = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestFillSuccessDownTypecast16) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillSuccessDownTypecast16.";

  // Create a RandomDataset with UInt16 numbers for given shape.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();  // fixed non-standard u_int32_t
  GlobalContext::config_manager()->set_seed(963);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeUInt16, {4});
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(2);
  EXPECT_NE(ds, nullptr);

  // Create Fill op - to fill with -3; downcast to uint16 wraps to 65533.
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateScalar(-3, &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"col1"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  // Note: 2**16 - 3 = 65536 - 3 = 65533
  std::vector<std::vector<uint16_t>> expected = {
    {65533, 65533, 65533, 65533}, {65533, 65533, 65533, 65533}, {65533, 65533, 65533, 65533}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestFillSuccessUpTypecast) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillSuccessUpTypecast.";

  // Create a RandomDataset with Float numbers for given shape.
  uint32_t curr_seed = GlobalContext::config_manager()->seed();  // fixed non-standard u_int32_t
  GlobalContext::config_manager()->set_seed(963);
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("col1", mindspore::DataType::kNumberTypeFloat32, {2});
  // Fixed: the sample count was passed as (float)4.0; it is a row count and
  // must be an integer.
  std::shared_ptr<Dataset> ds = RandomData(4, schema);
  EXPECT_NE(ds, nullptr);
  ds = ds->SetNumWorkers(2);
  EXPECT_NE(ds, nullptr);

  // Create Fill op - to fill with integer 0, upcast to float 0.0.
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateScalar(0, &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"col1"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<float_t>> expected = {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["col1"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline and restore the original seed.
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
}

TEST_F(MindDataTestPipeline, TestFillSuccessString) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillSuccessString.";

  // Create a TextFile dataset.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Skip operation on ds.
  ds = ds->Skip(6);
  EXPECT_NE(ds, nullptr);

  // Create BasicTokenizer operation on ds.
  std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>(true);
  EXPECT_NE(basic_tokenizer, nullptr);

  // Create Map operation on ds.
  ds = ds->Map({basic_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create Fill op - to replace every token with the string "Hello".
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateScalar<std::string>("Hello", &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);

  // The single remaining line tokenizes into 5 tokens, all filled with "Hello".
  std::vector<std::string> expected = {"Hello", "Hello", "Hello", "Hello", "Hello"};
  std::shared_ptr<Tensor> de_expected_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline.
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestFillFailFillValueNotScalar) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFillFailFillValueNotScalar.";
  // Removed stray copy-paste comment about BasicTokenizer lower_case; this
  // test checks that Fill rejects a non-scalar fill_value.

  // Create a TextFile dataset.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Skip operation on ds.
  ds = ds->Skip(6);
  EXPECT_NE(ds, nullptr);

  // Create BasicTokenizer operation on ds.
  std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>(true);
  EXPECT_NE(basic_tokenizer, nullptr);

  // Create Map operation on ds.
  ds = ds->Map({basic_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create Fill op - with wrongful vector shape instead of scalar.
  std::vector<std::string> fill_string = {"ERROR"};
  std::shared_ptr<Tensor> fill_value_tensor;
  ASSERT_OK(Tensor::CreateFromVector(fill_string, &fill_value_tensor));
  mindspore::MSTensor fill_value_MSTensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(fill_value_tensor));
  // Renamed local from `mask` (copy-paste from the Mask tests) to `fill`.
  transforms::Fill fill = transforms::Fill(fill_value_MSTensor);
  ds = ds->Map({fill}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();

  // Expect failure: invalid Fill parameter (the shape of fill_value is not a scalar)
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestMaskSuccess) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMaskSuccess.";

@@ -866,4 +1294,4 @@ TEST_F(MindDataTestPipeline, TestTypeCastFail) {
std::shared_ptr<Iterator> iter = ds->CreateIterator();
// Expect failure: invalid TypeCast input
EXPECT_EQ(iter, nullptr);
}
}

+ 14
- 0
tests/ut/python/dataset/test_fill_op.py View File

@@ -73,6 +73,19 @@ def test_fillop_string():
np.testing.assert_array_equal(data_row[0], expected)


def test_fillop_bytes():
    """
    Test Fill op with a bytes fill value: every element of the input
    byte-string array is replaced by b'abc', keeping shape and dtype kind.
    """
    def gen():
        # One row: a 1-D array of 3 byte strings (numpy 'S' dtype).
        yield (np.array(["A", "B", "C"], dtype='S'),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill(b'abc')

    data = data.map(operations=fill_op, input_columns=["col"])
    expected = np.array([b'abc', b'abc', b'abc'], dtype='S')
    for data_row in data.create_tuple_iterator(output_numpy=True):
        np.testing.assert_array_equal(data_row[0], expected)


def test_fillop_error_handling():
def gen():
yield (np.array([4, 4, 4, 4]),)
@@ -92,4 +105,5 @@ if __name__ == "__main__":
test_fillop_up_type_cast()
test_fillop_down_type_cast()
test_fillop_string()
test_fillop_bytes()
test_fillop_error_handling()

+ 109
- 85
tests/ut/python/dataset/test_serdes_dataset.py View File

@@ -145,90 +145,6 @@ def test_serdes_mnist_dataset(remove_json_files=True):
delete_json_files()


def test_serdes_zip_dataset(remove_json_files=True):
    """
    Test serdes on zip dataset pipeline: serialize -> deserialize -> serialize
    must round-trip to an identical JSON file, and the deserialized pipeline
    must produce the same data as the original.
    """
    files = ["../data/dataset/testTFTestAllTypes/test.data"]
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
    ds.config.set_seed(1)

    ds0 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.GLOBAL)
    data1 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)
    # Rename data2's columns so the zip below has no duplicate column names.
    data2 = data2.rename(input_columns=["col_sint16", "col_sint32", "col_sint64", "col_float",
                                        "col_1d", "col_2d", "col_3d", "col_binary"],
                         output_columns=["column_sint16", "column_sint32", "column_sint64", "column_float",
                                         "column_1d", "column_2d", "column_3d", "column_binary"])
    data3 = ds.zip((data1, data2))
    ds.serialize(data3, "zip_dataset_pipeline.json")
    assert validate_jsonfile("zip_dataset_pipeline.json") is True
    assert validate_jsonfile("zip_dataset_pipeline_typo.json") is False

    # Deserialize and re-serialize; both JSON files must be byte-identical.
    data4 = ds.deserialize(json_filepath="zip_dataset_pipeline.json")
    ds.serialize(data4, "zip_dataset_pipeline_1.json")
    assert validate_jsonfile("zip_dataset_pipeline_1.json") is True
    assert filecmp.cmp('zip_dataset_pipeline.json', 'zip_dataset_pipeline_1.json')

    rows = 0
    # Zipped rows carry data1's columns first, then data2's renamed columns,
    # so each original column appears at offset and offset + num_cols.
    for d0, d3, d4 in zip(ds0.create_tuple_iterator(output_numpy=True), data3.create_tuple_iterator(output_numpy=True),
                          data4.create_tuple_iterator(output_numpy=True)):
        num_cols = len(d0)
        offset = 0
        for t1 in d0:
            np.testing.assert_array_equal(t1, d3[offset])
            np.testing.assert_array_equal(t1, d3[offset + num_cols])
            np.testing.assert_array_equal(t1, d4[offset])
            np.testing.assert_array_equal(t1, d4[offset + num_cols])
            offset += 1
        rows += 1
    assert rows == 12

    if remove_json_files:
        delete_json_files()


def test_serdes_random_crop():
    """
    Test serdes on RandomCrop pipeline: serialize to a dict, rebuild the
    pipeline from it, and check both pipelines produce identical images.
    """
    logger.info("test_random_crop")
    DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
    SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
    # Fix seed/workers so the random crop is reproducible across both runs.
    original_seed = config_get_set_seed(1)
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
    decode_op = vision.Decode()
    random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200])
    data1 = data1.map(operations=decode_op, input_columns="image")
    data1 = data1.map(operations=random_crop_op, input_columns="image")

    # Serializing into python dictionary
    ds1_dict = ds.serialize(data1)
    # Serializing into json object (smoke check that the dict is JSON-able)
    _ = json.dumps(ds1_dict, indent=2)

    # Reconstruct dataset pipeline from its serialized form
    data1_1 = ds.deserialize(input_dict=ds1_dict)

    # Second dataset: decode only, used to keep iteration in lockstep.
    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
    data2 = data2.map(operations=decode_op, input_columns="image")

    for item1, item1_1, item2 in zip(data1.create_dict_iterator(num_epochs=1, output_numpy=True),
                                     data1_1.create_dict_iterator(num_epochs=1, output_numpy=True),
                                     data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1['image'], item1_1['image'])
        _ = item2["image"]

    # Restore configuration seed and num_parallel_workers
    ds.config.set_seed(original_seed)
    ds.config.set_num_parallel_workers(original_num_parallel_workers)


def test_serdes_cifar10_dataset(remove_json_files=True):
"""
Test serdes on Cifar10 dataset pipeline
@@ -351,6 +267,90 @@ def test_serdes_voc_dataset(remove_json_files=True):
ds.config.set_num_parallel_workers(original_num_parallel_workers)


def test_serdes_zip_dataset(remove_json_files=True):
    """
    Test serdes on zip dataset pipeline.

    Builds a zip of two TFRecord pipelines (the second with shuffle + rename),
    serializes it to JSON, deserializes it back, and verifies both the
    re-serialized JSON and the per-row tensor contents match the original.

    Args:
        remove_json_files (bool): delete the generated json files on success.
    """
    files = ["../data/dataset/testTFTestAllTypes/test.data"]
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
    # Save the original seed so it can be restored at the end; the previous
    # version set the seed unconditionally and leaked it into later tests.
    original_seed = config_get_set_seed(1)

    ds0 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.GLOBAL)
    data1 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)
    data2 = data2.rename(input_columns=["col_sint16", "col_sint32", "col_sint64", "col_float",
                                        "col_1d", "col_2d", "col_3d", "col_binary"],
                         output_columns=["column_sint16", "column_sint32", "column_sint64", "column_float",
                                         "column_1d", "column_2d", "column_3d", "column_binary"])
    data3 = ds.zip((data1, data2))
    ds.serialize(data3, "zip_dataset_pipeline.json")
    assert validate_jsonfile("zip_dataset_pipeline.json") is True
    # A file that was never written must not validate.
    assert validate_jsonfile("zip_dataset_pipeline_typo.json") is False

    # Deserialize and re-serialize; the two json files must be byte-identical.
    data4 = ds.deserialize(json_filepath="zip_dataset_pipeline.json")
    ds.serialize(data4, "zip_dataset_pipeline_1.json")
    assert validate_jsonfile("zip_dataset_pipeline_1.json") is True
    assert filecmp.cmp('zip_dataset_pipeline.json', 'zip_dataset_pipeline_1.json')

    # Each zipped row holds both branches' columns; with the same seed the
    # reference pipeline (ds0), the zip (data3) and its round-trip (data4)
    # must all agree, column by column.
    rows = 0
    for d0, d3, d4 in zip(ds0.create_tuple_iterator(output_numpy=True), data3.create_tuple_iterator(output_numpy=True),
                          data4.create_tuple_iterator(output_numpy=True)):
        num_cols = len(d0)
        offset = 0
        for t1 in d0:
            np.testing.assert_array_equal(t1, d3[offset])
            np.testing.assert_array_equal(t1, d3[offset + num_cols])
            np.testing.assert_array_equal(t1, d4[offset])
            np.testing.assert_array_equal(t1, d4[offset + num_cols])
            offset += 1
        rows += 1
    assert rows == 12

    if remove_json_files:
        delete_json_files()

    # Restore configuration seed
    ds.config.set_seed(original_seed)


def test_serdes_random_crop():
    """
    Test serdes on RandomCrop pipeline: serialize, deserialize, and compare
    the restored pipeline's output against the original row by row.
    """
    logger.info("test_random_crop")
    DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
    SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
    original_seed = config_get_set_seed(1)
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)

    # First dataset: decode then crop with padding on all four sides.
    decode_op = vision.Decode()
    random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200])
    data1 = (ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
             .map(operations=decode_op, input_columns="image")
             .map(operations=random_crop_op, input_columns="image"))

    # Serialize to a dictionary, make sure it survives a json dump, and
    # rebuild the pipeline from the dictionary form.
    ds1_dict = ds.serialize(data1)
    _ = json.dumps(ds1_dict, indent=2)
    data1_1 = ds.deserialize(input_dict=ds1_dict)

    # Second dataset: decode only (iterated just for coverage).
    data2 = (ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
             .map(operations=decode_op, input_columns="image"))

    iter1 = data1.create_dict_iterator(num_epochs=1, output_numpy=True)
    iter1_1 = data1_1.create_dict_iterator(num_epochs=1, output_numpy=True)
    iter2 = data2.create_dict_iterator(num_epochs=1, output_numpy=True)
    for item1, item1_1, item2 in zip(iter1, iter1_1, iter2):
        np.testing.assert_array_equal(item1['image'], item1_1['image'])
        _ = item2["image"]

    # Restore configuration num_parallel_workers
    ds.config.set_seed(original_seed)
    ds.config.set_num_parallel_workers(original_num_parallel_workers)


def test_serdes_to_device(remove_json_files=True):
"""
Test serdes on transfer dataset pipeline.
@@ -405,6 +405,25 @@ def test_serdes_uniform_augment(remove_json_files=True):
util_check_serialize_deserialize_file(data, "uniform_augment_pipeline", remove_json_files)


def skip_test_serdes_fill(remove_json_files=True):
    """
    Test serdes on Fill data transform.

    Args:
        remove_json_files (bool): delete the generated json files on success.
    """
    def gen():
        yield (np.array([4, 5, 6, 7], dtype=np.int32),)

    pipeline = ds.GeneratorDataset(gen, column_names=["col"])
    pipeline = pipeline.map(operations=c.Fill(3), input_columns=["col"])

    # Fill(3) replaces every element of the column with the fill value.
    expected = np.array([3, 3, 3, 3], dtype=np.int32)
    for data_row in pipeline:
        np.testing.assert_array_equal(data_row[0].asnumpy(), expected)

    # FIXME - need proper serdes support for Fill's fill_value parameter
    util_check_serialize_deserialize_file(pipeline, "fill_pipeline", remove_json_files)


def test_serdes_exception():
"""
Test exception case in serdes
@@ -465,7 +484,7 @@ def delete_json_files():


# Test save load minddataset
def skip_test_minddataset(add_and_remove_cv_file):
def skip_test_minddataset(add_and_remove_cv_file=True):
"""tutorial for cv minderdataset."""
columns_list = ["data", "file_name", "label"]
num_readers = 4
@@ -504,4 +523,9 @@ if __name__ == '__main__':
test_serdes_voc_dataset()
test_serdes_zip_dataset()
test_serdes_random_crop()
test_serdes_to_device()
test_serdes_pyvision()
test_serdes_uniform_augment()
skip_test_serdes_fill()
test_serdes_exception()
skip_test_minddataset()

Loading…
Cancel
Save