Browse Source

!15296 dataset: C++ API ToNumber & Lookup: change data_type parameter from string to mindspore::DataType

From: @cathwong
Reviewed-by: @nsyca,@robingrosman
Signed-off-by: @robingrosman
pull/15296/MERGE
mindspore-ci-bot Gitee 4 years ago
parent
commit
1f516efc99
8 changed files with 197 additions and 45 deletions
  1. +15
    -9
      mindspore/ccsrc/minddata/dataset/api/text.cc
  2. +7
    -8
      mindspore/ccsrc/minddata/dataset/include/text.h
  3. +34
    -6
      mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
  4. +9
    -4
      mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h
  5. +6
    -0
      tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc
  6. +7
    -7
      tests/ut/cpp/dataset/c_api_text_test.cc
  7. +118
    -10
      tests/ut/cpp/dataset/c_api_text_vocab_test.cc
  8. +1
    -1
      tests/ut/python/dataset/test_vocab.py

+ 15
- 9
mindspore/ccsrc/minddata/dataset/api/text.cc View File

@@ -19,7 +19,8 @@
#include <regex> #include <regex>


#include "minddata/dataset/include/text.h" #include "minddata/dataset/include/text.h"

#include "mindspore/core/ir/dtype/type_id.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h" #include "minddata/dataset/text/ir/kernels/text_ir.h"


namespace mindspore { namespace mindspore {
@@ -203,16 +204,20 @@ Status JiebaTokenizer::ParserFile(const std::string &file_path,
// Lookup // Lookup
struct Lookup::Data { struct Lookup::Data {
Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token, Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
const std::vector<char> &data_type)
: vocab_(vocab), unknown_token_(OptionalCharToString(unknown_token)), data_type_(CharToString(data_type)) {}
mindspore::DataType data_type)
: vocab_(vocab),
unknown_token_(OptionalCharToString(unknown_token)),
data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {}
std::shared_ptr<Vocab> vocab_; std::shared_ptr<Vocab> vocab_;
std::optional<std::string> unknown_token_; std::optional<std::string> unknown_token_;
std::string data_type_;
dataset::DataType data_type_;
}; };


Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token, Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
const std::vector<char> &data_type)
: data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}
mindspore::DataType data_type)
: data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {
data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
}


std::shared_ptr<TensorOperation> Lookup::Parse() { std::shared_ptr<TensorOperation> Lookup::Parse() {
return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_); return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
@@ -331,11 +336,12 @@ std::shared_ptr<TensorOperation> SlidingWindow::Parse() {


// ToNumber // ToNumber
struct ToNumber::Data { struct ToNumber::Data {
explicit Data(const std::vector<char> &data_type) : data_type_(CharToString(data_type)) {}
std::string data_type_;
dataset::DataType data_type_;
}; };


ToNumber::ToNumber(const std::vector<char> &data_type) : data_(std::make_shared<Data>(data_type)) {}
// Constructor: creates a default-constructed Data holder, then stores data_type after converting the
// mindspore::DataType value (cast to TypeId) to the dataset-internal DataType via dataset::MSTypeToDEType.
ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
}


std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); } std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }




+ 7
- 8
mindspore/ccsrc/minddata/dataset/include/text.h View File

@@ -207,13 +207,14 @@ class Lookup final : public TensorTransform {
/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov). /// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
/// If unknown_token is oov, runtime error will be thrown. If unknown_token is {}, which means that not to /// If unknown_token is oov, runtime error will be thrown. If unknown_token is {}, which means that not to
/// specify unknown_token when word being out of Vocabulary (default={}). /// specify unknown_token when word being out of Vocabulary (default={}).
/// \param[in] data_type type of the tensor after lookup, typically int32.
/// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool.
/// (default=mindspore::DataType::kNumberTypeInt32).
explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {}, explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {},
const std::string &data_type = "int32")
: Lookup(vocab, OptionalStringToChar(unknown_token), StringToChar(data_type)) {}
mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32)
: Lookup(vocab, OptionalStringToChar(unknown_token), data_type) {}


explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token, explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
const std::vector<char> &data_type);
mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32);


/// \brief Destructor /// \brief Destructor
~Lookup() = default; ~Lookup() = default;
@@ -405,10 +406,8 @@ class SlidingWindow final : public TensorTransform {
class ToNumber final : public TensorTransform { class ToNumber final : public TensorTransform {
public: public:
/// \brief Constructor. /// \brief Constructor.
/// \param[in] data_type of the tensor to be cast to. Must be a numeric type.
explicit ToNumber(const std::string &data_type) : ToNumber(StringToChar(data_type)) {}

explicit ToNumber(const std::vector<char> &data_type);
/// \param[in] data_type mindspore::DataType of the tensor to be cast to. Must be a numeric type, excluding bool.
explicit ToNumber(mindspore::DataType data_type);


/// \brief Destructor /// \brief Destructor
~ToNumber() = default; ~ToNumber() = default;


+ 34
- 6
mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc View File

@@ -42,6 +42,7 @@
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif #endif
#include "minddata/dataset/core/data_type.h" #include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/util/path.h" #include "minddata/dataset/util/path.h"


#include "minddata/dataset/text/ir/validators.h" #include "minddata/dataset/text/ir/validators.h"
@@ -166,10 +167,20 @@ Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
} }


// LookupOperation // LookupOperation
// DataType data_type - required for C++ API
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
const std::string &data_type)
DataType data_type)
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {} : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}


// Pybind-facing overload: data_type is supplied as a type-name string.
// The string is converted to a DataType right in the member-initializer list.
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                                 const std::string &data_type)
    : vocab_(vocab),
      unknown_token_(unknown_token),
      default_id_(Vocab::kNoTokenExists),
      data_type_(DataType(data_type)) {}

LookupOperation::~LookupOperation() = default; LookupOperation::~LookupOperation() = default;


Status LookupOperation::ValidateParams() { Status LookupOperation::ValidateParams() {
@@ -187,8 +198,9 @@ Status LookupOperation::ValidateParams() {
} }
} }


if (!IsTypeNumeric(data_type_)) {
std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
if (!data_type_.IsNumeric()) {
// Note: For DEType, Bool is counted as numeric, and is a valid type for Lookup
std::string err_msg = "Lookup : The parameter data_type must be numeric including bool.";
MS_LOG(ERROR) << err_msg; MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg); RETURN_STATUS_SYNTAX_ERROR(err_msg);
} }
@@ -351,11 +363,20 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
} }


// ToNumberOperation // ToNumberOperation
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}
// Overload taking a DataType directly and storing it unchanged; this is the form used by the C++ API.
ToNumberOperation::ToNumberOperation(DataType data_type) : data_type_(data_type) {}

// Pybind-facing overload: data_type is supplied as a type-name string and
// converted to a DataType in the member-initializer list.
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(DataType(data_type)) {}


Status ToNumberOperation::ValidateParams() { Status ToNumberOperation::ValidateParams() {
if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
if (!data_type_.IsNumeric() || data_type_.IsBool()) {
// Note: For DEType, Bool is counted as numeric, but is not a valid type for ToNumber.
std::string err_msg = "ToNumber : The parameter data_type must be numeric and excludes bool.";
MS_LOG(ERROR) << err_msg; MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg); RETURN_STATUS_SYNTAX_ERROR(err_msg);
} }
@@ -368,6 +389,13 @@ std::shared_ptr<TensorOp> ToNumberOperation::Build() {
return tensor_op; return tensor_op;
} }


// Serializes this operation's arguments to JSON.
// \param[out] out_json Receives an object whose only key, "data_type", holds data_type_.ToString().
//     The pointer is dereferenced without a null check, so callers must pass a valid pointer.
// \return Status::OK() in all cases.
Status ToNumberOperation::to_json(nlohmann::json *out_json) {
  nlohmann::json op_args;
  op_args["data_type"] = data_type_.ToString();
  *out_json = op_args;
  return Status::OK();
}

// TruncateSequencePairOperation // TruncateSequencePairOperation
TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {} TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}




+ 9
- 4
mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h View File

@@ -142,7 +142,9 @@ class JiebaTokenizerOperation : public TensorOperation {
class LookupOperation : public TensorOperation { class LookupOperation : public TensorOperation {
public: public:
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
const std::string &data_type);
DataType data_type); // Used for C++ API
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
const std::string &data_type); // Used for Pybind


~LookupOperation(); ~LookupOperation();


@@ -156,7 +158,7 @@ class LookupOperation : public TensorOperation {
std::shared_ptr<Vocab> vocab_; std::shared_ptr<Vocab> vocab_;
std::optional<std::string> unknown_token_; std::optional<std::string> unknown_token_;
int32_t default_id_; int32_t default_id_;
std::string data_type_;
DataType data_type_;
}; };


class NgramOperation : public TensorOperation { class NgramOperation : public TensorOperation {
@@ -273,7 +275,8 @@ class SlidingWindowOperation : public TensorOperation {


class ToNumberOperation : public TensorOperation { class ToNumberOperation : public TensorOperation {
public: public:
explicit ToNumberOperation(std::string data_type);
explicit ToNumberOperation(DataType data_type); // Used for C++ API
explicit ToNumberOperation(std::string data_type); // Used for Pybind


~ToNumberOperation() = default; ~ToNumberOperation() = default;


@@ -283,8 +286,10 @@ class ToNumberOperation : public TensorOperation {


std::string Name() const override { return kToNumberOperation; } std::string Name() const override { return kToNumberOperation; }


Status to_json(nlohmann::json *out_json) override;

private: private:
std::string data_type_;
DataType data_type_;
}; };


class TruncateSequencePairOperation : public TensorOperation { class TruncateSequencePairOperation : public TensorOperation {


+ 6
- 0
tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc View File

@@ -89,6 +89,9 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
} }


EXPECT_EQ(i, 1); EXPECT_EQ(i, 1);

// Manually terminate the pipeline
iter->Stop();
} }


TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) { TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
@@ -149,6 +152,9 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
} }


EXPECT_EQ(i, 1); EXPECT_EQ(i, 1);

// Manually terminate the pipeline
iter->Stop();
} }


TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) { TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {


+ 7
- 7
tests/ut/cpp/dataset/c_api_text_test.cc View File

@@ -1541,7 +1541,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create ToNumber operation on ds // Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int64");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
EXPECT_NE(to_number, nullptr); EXPECT_NE(to_number, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -1596,7 +1596,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create ToNumber operation on ds // Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("float64");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
EXPECT_NE(to_number, nullptr); EXPECT_NE(to_number, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -1651,7 +1651,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create ToNumber operation on ds // Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int8");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
EXPECT_NE(to_number, nullptr); EXPECT_NE(to_number, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -1701,7 +1701,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) {
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create ToNumber operation on ds // Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("float16");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
EXPECT_NE(to_number, nullptr); EXPECT_NE(to_number, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -1747,7 +1747,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) {
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create ToNumber operation on ds // Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int64");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
EXPECT_NE(to_number, nullptr); EXPECT_NE(to_number, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -1789,7 +1789,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail4) {
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create ToNumber operation on ds // Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("string");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
EXPECT_NE(to_number, nullptr); EXPECT_NE(to_number, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -1812,7 +1812,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail5) {
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create ToNumber operation on ds // Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("bool");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
EXPECT_NE(to_number, nullptr); EXPECT_NE(to_number, nullptr);


// Create a Map operation on ds // Create a Map operation on ds


+ 118
- 10
tests/ut/cpp/dataset/c_api_text_vocab_test.cc View File

@@ -36,10 +36,10 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
}; };


// Macro to compare 2 MSTensors as not equal; compare datasize only // Macro to compare 2 MSTensors as not equal; compare datasize only
#define EXPECT_MSTENSOR_DATA_NE(_mstensor1, _mstensor2) \
do { \
EXPECT_NE(_mstensor1.DataSize(), _mstensor2.DataSize()); \
} while (false)
#define EXPECT_MSTENSOR_DATA_NE(_mstensor1, _mstensor2) \
do { \
EXPECT_NE(_mstensor1.DataSize(), _mstensor2.DataSize()); \
} while (false)


TEST_F(MindDataTestPipeline, TestVocabLookupOp) { TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp.";
@@ -56,7 +56,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
EXPECT_EQ(s, Status::OK()); EXPECT_EQ(s, Status::OK());


// Create Lookup operation on ds // Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr); EXPECT_NE(lookup, nullptr);


// Create Map operation on ds // Create Map operation on ds
@@ -87,6 +88,11 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
ASSERT_OK(iter->GetNextRow(&row)); ASSERT_OK(iter->GetNextRow(&row));
i++; i++;
} }

EXPECT_EQ(i, 6);

// Manually terminate the pipeline
iter->Stop();
} }


TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) { TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
@@ -104,7 +110,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
EXPECT_EQ(s, Status::OK()); EXPECT_EQ(s, Status::OK());


// Create Lookup operation on ds // Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr); EXPECT_NE(lookup, nullptr);


// Create Map operation on ds // Create Map operation on ds
@@ -135,6 +142,60 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
ASSERT_OK(iter->GetNextRow(&row)); ASSERT_OK(iter->GetNextRow(&row));
i++; i++;
} }

EXPECT_EQ(i, 6);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestVocabLookupBool) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupBool.";
// Invoke Lookup with Bool data_type

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {"<pad>", "<unk>"}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeBool);
EXPECT_NE(lookup, nullptr);

// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));

uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);

ASSERT_OK(iter->GetNextRow(&row));
i++;
}

EXPECT_EQ(i, 6);

// Manually terminate the pipeline
iter->Stop();
} }


TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) { TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
@@ -151,7 +212,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
EXPECT_EQ(s, Status::OK()); EXPECT_EQ(s, Status::OK());


// Create lookup op for ds // Create lookup op for ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr); EXPECT_NE(lookup, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -174,7 +236,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
std::shared_ptr<Vocab> vocab; std::shared_ptr<Vocab> vocab;


// Create lookup op // Create lookup op
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr); EXPECT_NE(lookup, nullptr);


// Create a Map operation on ds // Create a Map operation on ds
@@ -186,6 +249,33 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
EXPECT_EQ(iter, nullptr); EXPECT_EQ(iter, nullptr);
} }


// Negative test: a Lookup built with a string data_type must make iterator creation fail.
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail3DataType) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail3DataType.";

  // Source dataset: unshuffled TextFile over the vocab word list
  std::string words_path = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Vocabulary built from an in-memory word vector plus two special tokens
  std::vector<std::string> words = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(words, {"<pad>", "<unk>"}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // Lookup op requesting a string output type, which is the invalid input under test
  std::shared_ptr<TensorTransform> lookup_op =
    std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kObjectTypeString);
  EXPECT_NE(lookup_op, nullptr);

  // Attach the op to the dataset through Map
  ds = ds->Map({lookup_op});
  EXPECT_NE(ds, nullptr);

  // Expect failure: invalid Lookup input (String is not valid for data_type)
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestVocabFromDataset) { TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";


@@ -204,7 +294,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
EXPECT_EQ(home_index, 4); EXPECT_EQ(home_index, 4);


// Create Lookup operation on ds // Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr); EXPECT_NE(lookup, nullptr);


// Create Map operation on ds // Create Map operation on ds
@@ -235,6 +326,11 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
ASSERT_OK(iter->GetNextRow(&row)); ASSERT_OK(iter->GetNextRow(&row));
i++; i++;
} }

EXPECT_EQ(i, 6);

// Manually terminate the pipeline
iter->Stop();
} }


TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) { TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
@@ -254,6 +350,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
EXPECT_EQ(home_index, 2); EXPECT_EQ(home_index, 2);


// Create Lookup operation on ds // Create Lookup operation on ds
// Use default data_type parameter
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "home"); std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "home");
EXPECT_NE(lookup, nullptr); EXPECT_NE(lookup, nullptr);


@@ -293,6 +390,11 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
ASSERT_OK(iter->GetNextRow(&row)); ASSERT_OK(iter->GetNextRow(&row));
i++; i++;
} }

EXPECT_EQ(i, 6);

// Manually terminate the pipeline
iter->Stop();
} }


TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) { TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) {
@@ -371,7 +473,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
EXPECT_EQ(home_index, 2); EXPECT_EQ(home_index, 2);


// Create Lookup operation on ds // Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "home", "int64");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "home", mindspore::DataType::kNumberTypeInt64);
EXPECT_NE(lookup, nullptr); EXPECT_NE(lookup, nullptr);


// Create Map operation on ds // Create Map operation on ds
@@ -410,4 +513,9 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
ASSERT_OK(iter->GetNextRow(&row)); ASSERT_OK(iter->GetNextRow(&row));
i++; i++;
} }

EXPECT_EQ(i, 6);

// Manually terminate the pipeline
iter->Stop();
} }

+ 1
- 1
tests/ut/python/dataset/test_vocab.py View File

@@ -202,7 +202,7 @@ def test_lookup_cast_type():
assert test_config("unk") == np.dtype("int32") assert test_config("unk") == np.dtype("int32")
# test exception, data_type isn't the correct type # test exception, data_type isn't the correct type
assert "tldr is not of type [<class 'mindspore._c_expression.typing.Type'>]" in test_config("unk", "tldr") assert "tldr is not of type [<class 'mindspore._c_expression.typing.Type'>]" in test_config("unk", "tldr")
assert "Lookup does not support a string to string mapping, data_type can only be numeric." in \
assert "Lookup : The parameter data_type must be numeric including bool." in \
test_config("w1", mstype.string) test_config("w1", mstype.string)






Loading…
Cancel
Save