Browse Source

!10799 make user-facing headers standalone for minddata

From: @mhmotallebi
Reviewed-by: 
Signed-off-by:
tags/v1.2.0-rc1
mindspore-ci-bot Gitee 5 years ago
parent
commit
a477a97278
33 changed files with 182 additions and 83 deletions
  1. +1
    -0
      mindspore/ccsrc/minddata/dataset/api/config.cc
  2. +18
    -1
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  3. +4
    -0
      mindspore/ccsrc/minddata/dataset/api/iterator.cc
  4. +2
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
  5. +1
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
  6. +28
    -8
      mindspore/ccsrc/minddata/dataset/api/text.cc
  7. +3
    -0
      mindspore/ccsrc/minddata/dataset/core/constants.h
  8. +1
    -1
      mindspore/ccsrc/minddata/dataset/core/global_context.h
  9. +3
    -1
      mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
  10. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h
  11. +14
    -1
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
  12. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h
  13. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc
  14. +2
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h
  15. +0
    -2
      mindspore/ccsrc/minddata/dataset/include/config.h
  16. +3
    -0
      mindspore/ccsrc/minddata/dataset/include/constants.h
  17. +18
    -13
      mindspore/ccsrc/minddata/dataset/include/datasets.h
  18. +4
    -4
      mindspore/ccsrc/minddata/dataset/include/de_tensor.h
  19. +5
    -4
      mindspore/ccsrc/minddata/dataset/include/execute.h
  20. +2
    -3
      mindspore/ccsrc/minddata/dataset/include/iterator.h
  21. +2
    -1
      mindspore/ccsrc/minddata/dataset/include/samplers.h
  22. +22
    -0
      mindspore/ccsrc/minddata/dataset/include/status.h
  23. +17
    -18
      mindspore/ccsrc/minddata/dataset/include/text.h
  24. +0
    -1
      mindspore/ccsrc/minddata/dataset/include/type_id.h
  25. +2
    -1
      mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h
  26. +1
    -0
      tests/ut/cpp/dataset/build_vocab_test.cc
  27. +0
    -3
      tests/ut/cpp/dataset/c_api_cache_test.cc
  28. +1
    -0
      tests/ut/cpp/dataset/c_api_dataset_ops_test.cc
  29. +1
    -0
      tests/ut/cpp/dataset/c_api_samplers_test.cc
  30. +3
    -2
      tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc
  31. +12
    -12
      tests/ut/cpp/dataset/c_api_text_test.cc
  32. +8
    -7
      tests/ut/cpp/dataset/c_api_text_vocab_test.cc
  33. +1
    -0
      tests/ut/cpp/dataset/ir_callback_test.cc

+ 1
- 0
mindspore/ccsrc/minddata/dataset/api/config.cc View File

@@ -17,6 +17,7 @@
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/include/config.h"
#include "minddata/dataset/util/log_adapter.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {


+ 18
- 1
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -19,16 +19,33 @@
#include <fstream>
#include <unordered_set>
#include <utility>

#include "minddata/dataset/engine/runtime_context.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"

#include "minddata/dataset/core/client.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"

#include "minddata/dataset/kernels/c_func_op.h"
#include "minddata/dataset/kernels/tensor_op.h"

#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h"
#endif

#ifndef ENABLE_ANDROID
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
#endif

// Sampler headers (in alphabetical order)
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"

#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"

// IR non-leaf nodes
#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
#ifndef ENABLE_ANDROID
@@ -57,7 +74,6 @@
#endif

#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/random.h"
#include "minddata/dataset/util/services.h"

@@ -939,6 +955,7 @@ TFRecordDataset::TFRecordDataset(const std::vector<std::string> &dataset_files,
shard_id, shard_equal_rows, cache);
ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
}

#endif
} // namespace dataset
} // namespace mindspore

+ 4
- 0
mindspore/ccsrc/minddata/dataset/api/iterator.cc View File

@@ -16,11 +16,15 @@
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/core/client.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/runtime_context.h"
#include "minddata/dataset/include/datasets.h"

namespace mindspore {
namespace dataset {

// Default constructor: the iterator starts out with a null consumer_.
Iterator::Iterator()
    : consumer_(nullptr) {
}
// Destructor: invokes Stop() on this iterator before the object goes away.
Iterator::~Iterator() {
  this->Stop();
}

// Get the next row from the data pipeline.
bool Iterator::GetNextRow(TensorMap *row) {
Status rc = consumer_->GetNextAsMap(row);


+ 2
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc View File

@@ -23,6 +23,8 @@
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"

// IR non-leaf nodes
#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
#include "minddata/dataset/engine/ir/datasetops/concat_node.h"


+ 1
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc View File

@@ -21,6 +21,7 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/include/constants.h"

namespace mindspore {
namespace dataset {


+ 28
- 8
mindspore/ccsrc/minddata/dataset/api/text.cc View File

@@ -39,6 +39,7 @@
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/util/path.h"

namespace mindspore {
@@ -87,7 +88,7 @@ std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_p
}

std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const DataType &data_type) {
const std::string &data_type) {
auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);

return op->ValidateParams() ? op : nullptr;
@@ -142,7 +143,7 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const
return op->ValidateParams() ? op : nullptr;
}

std::shared_ptr<ToNumberOperation> ToNumber(const DataType data_type) {
std::shared_ptr<ToNumberOperation> ToNumber(const std::string &data_type) {
auto op = std::make_shared<ToNumberOperation>(data_type);

return op->ValidateParams() ? op : nullptr;
@@ -200,6 +201,19 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s
return Status::OK();
}

// Helper functions to help validate data type passed by user
// Reports whether data_type is exactly (case-sensitively) one of the accepted
// integer or floating-point type names. "bool" and "string" are not accepted.
bool IsTypeNumeric(const std::string &data_type) {
  static const char *const kNumericTypeNames[] = {"int8",   "uint8",   "int16",   "uint16", "int32",  "uint32",
                                                  "int64",  "uint64",  "float16", "float32", "float64"};
  for (const char *candidate : kNumericTypeNames) {
    if (data_type == candidate) {
      return true;
    }
  }
  return false;
}

// True only when data_type is exactly the (case-sensitive) name "bool".
bool IsTypeBoolean(const std::string &data_type) {
  return data_type.compare("bool") == 0;
}

// True only when data_type is exactly the (case-sensitive) name "string".
bool IsTypeString(const std::string &data_type) {
  return data_type.compare("string") == 0;
}

/* ####################################### Derived TensorOperation classes ################################# */

// (In alphabetical order)
@@ -239,6 +253,8 @@ BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &voc
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets) {}

BertTokenizerOperation::~BertTokenizerOperation() = default;

Status BertTokenizerOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
@@ -303,9 +319,11 @@ std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {

// LookupOperation
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const DataType &data_type)
const std::string &data_type)
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}

LookupOperation::~LookupOperation() = default;

Status LookupOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "Lookup: vocab object type is incorrect or null.";
@@ -320,7 +338,7 @@ Status LookupOperation::ValidateParams() {
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}

if (!data_type_.IsNumeric()) {
if (!IsTypeNumeric(data_type_)) {
std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
@@ -330,7 +348,7 @@ Status LookupOperation::ValidateParams() {
}

std::shared_ptr<TensorOp> LookupOperation::Build() {
std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, data_type_);
std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_));
return tensor_op;
}

@@ -419,6 +437,8 @@ std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
#endif

// SentencePieceTokenizerOperation
SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default;

SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
SPieceTokenizerOutType out_type)
: vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
@@ -482,11 +502,11 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
}

// ToNumberOperation
ToNumberOperation::ToNumberOperation(DataType data_type) : data_type_(data_type) {}
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}

Status ToNumberOperation::ValidateParams() {
if (!data_type_.IsNumeric() || data_type_.IsBool()) {
std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_.ToString();
if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}


+ 3
- 0
mindspore/ccsrc/minddata/dataset/core/constants.h View File

@@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
// Possible values for SPieceTokenizerLoadType
enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };

// Enumerates the SentencePieceModel kinds; every numeric value is pinned explicitly.
enum class SentencePieceModel {
  kUnigram = 0,
  kBpe = 1,
  kChar = 2,
  kWord = 3
};

// Possible values for NormalizeForm
enum class NormalizeForm {
kNone = 0,


+ 1
- 1
mindspore/ccsrc/minddata/dataset/core/global_context.h View File

@@ -19,6 +19,7 @@
#include <memory>
#include <mutex>

#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/util/allocator.h"
#include "minddata/dataset/util/status.h"
@@ -27,7 +28,6 @@ namespace mindspore {
namespace dataset {
// forward declare
class MemoryPool;
class ConfigManager;
class Tensor;
class CVTensor;



+ 3
- 1
mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc View File

@@ -22,8 +22,10 @@
#include <utility>
#include <vector>
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/tree_adapter.h"
#include "minddata/dataset/engine/datasetops/device_queue_op.h"
#include "minddata/dataset/engine/opt/pre/getter_pass.h"
#include "minddata/dataset/engine/tree_adapter.h"
#include "minddata/mindrecord/include/shard_index_generator.h"

#ifndef ENABLE_ANDROID
#include "minddata/mindrecord/include/shard_header.h"


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h View File

@@ -23,6 +23,7 @@
#include <unordered_map>
#include <vector>

#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/include/datasets.h"

namespace mindspore {


+ 14
- 1
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h View File

@@ -24,13 +24,26 @@
#include <utility>
#include <vector>

#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/engine/datasetops/filter_op.h"
#include "minddata/dataset/engine/datasetops/map_op/map_op.h"
#include "minddata/dataset/engine/datasetops/project_op.h"
#include "minddata/dataset/engine/datasetops/repeat_op.h"
#include "minddata/dataset/engine/datasetops/shuffle_op.h"
#include "minddata/dataset/engine/datasetops/skip_op.h"
#include "minddata/dataset/engine/datasetops/take_op.h"
#include "minddata/dataset/engine/ir/cache/dataset_cache.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {

class Dataset;
class DatasetCache;
class SamplerObj;
class IRNodePass;
class DatasetSizeGetter;


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h View File

@@ -22,6 +22,7 @@
#include <string>
#include <vector>

#include "minddata/dataset/engine/datasetops/source/mindrecord_op.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"

namespace mindspore {


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc View File

@@ -24,6 +24,7 @@
#include "minddata/dataset/engine/datasetops/source/random_data_op.h"
#include "minddata/dataset/util/random.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {



+ 2
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h View File

@@ -22,7 +22,9 @@
#include <utility>
#include <vector>

#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/include/samplers.h"

namespace mindspore {
namespace dataset {


+ 0
- 2
mindspore/ccsrc/minddata/dataset/include/config.h View File

@@ -20,8 +20,6 @@
#include <cstdint>
#include <string>

#include "minddata/dataset/util/log_adapter.h"

namespace mindspore {
namespace dataset {



+ 3
- 0
mindspore/ccsrc/minddata/dataset/include/constants.h View File

@@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
// Possible values for SPieceTokenizerLoadType
enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };

// Enumerates the SentencePieceModel kinds; every numeric value is pinned explicitly.
enum class SentencePieceModel {
  kUnigram = 0,
  kBpe = 1,
  kChar = 2,
  kWord = 3
};

// Possible values for NormalizeForm
enum class NormalizeForm {
kNone = 0,


+ 18
- 13
mindspore/ccsrc/minddata/dataset/include/datasets.h View File

@@ -17,6 +17,7 @@
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_

#include <sys/stat.h>
#include <unistd.h>
#include <map>
#include <memory>
@@ -26,27 +27,18 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "minddata/dataset/engine/ir/cache/dataset_cache.h"

#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/tensor.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/include/type_id.h"
#include "minddata/dataset/kernels/c_func_op.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/path.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
#endif

namespace mindspore {
namespace dataset {

class Tensor;
class TensorRow;
class TensorShape;
class TreeAdapter;
class TreeGetters;
@@ -54,6 +46,7 @@ class TreeGetters;
class Vocab;
#endif

class DatasetCache;
class DatasetNode;

class Iterator;
@@ -77,12 +70,20 @@ class ConcatDataset;
class RenameDataset;
#endif

#ifndef ENABLE_ANDROID
class SentencePieceVocab;
enum class SentencePieceModel;
#endif

class DSCallback;

class RepeatDataset;

#ifndef ENABLE_ANDROID
class SkipDataset;
class TakeDataset;
class ZipDataset;

#endif

/// \class Dataset datasets.h
@@ -969,8 +970,12 @@ std::shared_ptr<TFRecordDataset> TFRecord(const std::vector<std::string> &datase
} else {
std::string schema_path = schema;
if (!schema_path.empty()) {
Path schema_file(schema_path);
if (!schema_file.Exists()) {
struct stat sb;
int rc = stat(common::SafeCStr(schema_path), &sb);
if (rc == -1 && errno != ENOENT) {
MS_LOG(WARNING) << "Unable to query the status of [" << schema_path << "]. Errno = " << errno << ".";
}
if (rc != 0) {
MS_LOG(ERROR) << "TFRecordDataset: schema path [" << schema_path << "] is invalid or does not exist.";
return nullptr;
}


+ 4
- 4
mindspore/ccsrc/minddata/dataset/include/de_tensor.h View File

@@ -14,14 +14,14 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_
#include <string>
#include <vector>
#include <memory>
#include "include/ms_tensor.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/tensor.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace tensor {
class DETensor : public mindspore::tensor::MSTensor {
@@ -79,4 +79,4 @@ class DETensor : public mindspore::tensor::MSTensor {
};
} // namespace tensor
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_

+ 5
- 4
mindspore/ccsrc/minddata/dataset/include/execute.h View File

@@ -14,12 +14,13 @@
* limitations under the License.
*/

#ifndef DATASET_API_EXECUTE_H_
#define DATASET_API_EXECUTE_H_
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_

#include <vector>
#include <memory>
#include "minddata/dataset/core/constants.h"

#include "minddata/dataset/include/constants.h"
#ifdef ENABLE_ANDROID
#include "minddata/dataset/include/de_tensor.h"
#endif
@@ -55,4 +56,4 @@ class Execute {

} // namespace dataset
} // namespace mindspore
#endif // DATASET_API_EXECUTE_H_
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_

+ 2
- 3
mindspore/ccsrc/minddata/dataset/include/iterator.h View File

@@ -21,7 +21,6 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "minddata/dataset/engine/runtime_context.h"
#include "minddata/dataset/include/status.h"

namespace mindspore {
@@ -45,10 +44,10 @@ using TensorVec = std::vector<std::shared_ptr<Tensor>>;
class Iterator {
public:
/// \brief Constructor
Iterator() : consumer_(nullptr) {}
Iterator();

/// \brief Destructor
~Iterator() { Stop(); }
~Iterator();

/// \brief Method for building and launching the pipeline.
/// \param[in] ops - a vector of DatasetOp in the data pipeline.


+ 2
- 1
mindspore/ccsrc/minddata/dataset/include/samplers.h View File

@@ -21,10 +21,11 @@
#include <string>
#include <vector>

#include "minddata/dataset/util/status.h"
#include "minddata/dataset/include/status.h"
#ifndef ENABLE_ANDROID
#include "minddata/mindrecord/include/shard_column.h"
#include "minddata/mindrecord/include/shard_error.h"
#include "minddata/mindrecord/include/shard_operator.h"
#include "minddata/mindrecord/include/shard_reader.h"
#endif



+ 22
- 0
mindspore/ccsrc/minddata/dataset/include/status.h View File

@@ -51,6 +51,13 @@ namespace dataset {
} \
} while (false)

// Guard macro: when _condition evaluates to false, returns from the enclosing
// function with a Status built from StatusCode::kSyntaxError, the current
// __LINE__/__FILE__ and the message _e. When _condition is true nothing happens.
// The do { ... } while (false) wrapper makes the expansion a single statement,
// so the macro can be followed by ';' inside if/else bodies.
#define CHECK_FAIL_RETURN_SYNTAX_ERROR(_condition, _e) \
do { \
if (!(_condition)) { \
return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \
} \
} while (false)

#define RETURN_UNEXPECTED_IF_NULL(_ptr) \
do { \
if ((_ptr) == nullptr) { \
@@ -71,6 +78,15 @@ namespace dataset {
return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \
} while (false)

// Evaluates _s once into a local Status. If that Status reports an error, it is
// logged through MS_LOG(ERROR) and the enclosing function returns _r; otherwise
// execution continues after the macro.
// MS_LOG must be visible at every expansion site, since the macro body uses it.
// NOTE(review): the local name __rc contains a double underscore, which C++
// reserves for the implementation — consider renaming if no other macro in this
// header relies on the same spelling.
#define RETURN_SECOND_IF_ERROR(_s, _r) \
do { \
Status __rc = (_s); \
if (__rc.IsError()) { \
MS_LOG(ERROR) << __rc; \
return _r; \
} \
} while (false)

enum class StatusCode : char {
kOK = 0,
kOutOfMemory = 1,
@@ -151,6 +167,12 @@ class Status {
StatusCode code_;
std::string err_msg_;
};

// The declarations below exist only when neither _WIN32 nor _WIN64 is defined.
#if !defined(_WIN32) && !defined(_WIN64)
// Threshold constant with value 0.95.
// NOTE(review): presumably a fraction compared against GetMemoryUsage() — confirm against callers.
const float MAX_MEMORY_USAGE_THRESHOLD = 0.95;

// Declaration only; the definition is not in this header.
// NOTE(review): units/range of the returned value are not visible here — confirm.
float GetMemoryUsage();
#endif
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_STATUS_H_

+ 17
- 18
mindspore/ccsrc/minddata/dataset/include/text.h View File

@@ -22,18 +22,16 @@
#include <utility>
#include <vector>

#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/include/constants.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/util/status.h"

#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"

namespace mindspore {
namespace dataset {

class Vocab;
class SentencePieceVocab;

// Transform operations for text
namespace text {

@@ -146,10 +144,11 @@ std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_p
/// \param[in] vocab a Vocab object.
/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
/// If unknown_token is oov, runtime error will be thrown.
/// \param[in] DataType type of the tensor after lookup, typically int32.
/// \param[in] data_type type of the tensor after lookup, typically int32.
/// \return Shared pointer to the current TensorOperation.

std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const mindspore::dataset::DataType &data_type = DataType("int32"));
const std::string &data_type = "int32");

/// \brief TensorOp to generate n-gram from a 1-D string Tensor.
/// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
@@ -226,9 +225,9 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const
/// https://en.cppreference.com/w/cpp/string/basic_string/stof,
/// https://en.cppreference.com/w/cpp/string/basic_string/stoul,
/// except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
/// \param[in] data_type DataType of the tensor to be casted to. Must be a numeric type.
/// \param[in] data_type Name of the data type the tensor is cast to. Must be a numeric type.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<ToNumberOperation> ToNumber(const DataType data_type);
std::shared_ptr<ToNumberOperation> ToNumber(const std::string &data_type);

/// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
/// \param[in] max_length Maximum length required.
@@ -285,7 +284,7 @@ class BertTokenizerOperation : public TensorOperation {
bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets);

~BertTokenizerOperation() = default;
~BertTokenizerOperation();

std::shared_ptr<TensorOp> Build() override;

@@ -342,9 +341,9 @@ class JiebaTokenizerOperation : public TensorOperation {
class LookupOperation : public TensorOperation {
public:
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const DataType &data_type);
const std::string &data_type);

~LookupOperation() = default;
~LookupOperation();

std::shared_ptr<TensorOp> Build() override;

@@ -356,7 +355,7 @@ class LookupOperation : public TensorOperation {
std::shared_ptr<Vocab> vocab_;
std::string unknown_token_;
int32_t default_id_;
DataType data_type_;
std::string data_type_;
};

class NgramOperation : public TensorOperation {
@@ -439,7 +438,7 @@ class SentencePieceTokenizerOperation : public TensorOperation {

SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);

~SentencePieceTokenizerOperation() = default;
~SentencePieceTokenizerOperation();

std::shared_ptr<TensorOp> Build() override;

@@ -473,7 +472,7 @@ class SlidingWindowOperation : public TensorOperation {

class ToNumberOperation : public TensorOperation {
public:
explicit ToNumberOperation(DataType data_type);
explicit ToNumberOperation(std::string data_type);

~ToNumberOperation() = default;

@@ -484,7 +483,7 @@ class ToNumberOperation : public TensorOperation {
std::string Name() const override { return kToNumberOperation; }

private:
DataType data_type_;
std::string data_type_;
};

class TruncateSequencePairOperation : public TensorOperation {


+ 0
- 1
mindspore/ccsrc/minddata/dataset/include/type_id.h View File

@@ -16,7 +16,6 @@
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_

#include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h"

namespace mindspore {


+ 2
- 1
mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h View File

@@ -22,10 +22,11 @@
#include <vector>
#include <unordered_map>
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/include/constants.h"

namespace mindspore {
namespace dataset {
enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 };
class SentencePieceVocab {
public:
static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,


+ 1
- 0
tests/ut/cpp/dataset/build_vocab_test.cc View File

@@ -22,6 +22,7 @@
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/text/vocab.h"

using mindspore::dataset::Tensor;
using mindspore::dataset::Status;


+ 0
- 3
tests/ut/cpp/dataset/c_api_cache_test.cc View File

@@ -17,8 +17,6 @@
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/vision.h"

#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"

using namespace mindspore::dataset;

// Helper function to get the session id from SESSION_ID env variable
@@ -28,7 +26,6 @@ class MindDataTestCacheOp : public UT::DatasetOpTesting {
public:
void SetUp() override {
DatasetOpTesting::SetUp();
GlobalInit();
}
};



+ 1
- 0
tests/ut/cpp/dataset/c_api_dataset_ops_test.cc View File

@@ -15,6 +15,7 @@
*/
#include "common/common.h"
#include "minddata/dataset/core/tensor_row.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/vision.h"



+ 1
- 0
tests/ut/cpp/dataset/c_api_samplers_test.cc View File

@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/include/datasets.h"

using namespace mindspore::dataset;


+ 3
- 2
tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc View File

@@ -18,11 +18,12 @@
#include <string>

#include "common/common.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/include/constants.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"

using namespace mindspore::dataset;
using mindspore::dataset::SentencePieceModel;


+ 12
- 12
tests/ut/cpp/dataset/c_api_text_test.cc View File

@@ -21,11 +21,11 @@
#include "minddata/dataset/include/config.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/text/vocab.h"

using namespace mindspore::dataset;
using mindspore::dataset::DataType;
using mindspore::dataset::ShuffleMode;
using mindspore::dataset::Status;
using mindspore::dataset::Tensor;
@@ -1011,7 +1011,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
EXPECT_NE(ds, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("int64"));
std::shared_ptr<TensorOperation> to_number = text::ToNumber("int64");
EXPECT_NE(to_number, nullptr);

// Create a Map operation on ds
@@ -1064,7 +1064,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
EXPECT_NE(ds, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("float64"));
std::shared_ptr<TensorOperation> to_number = text::ToNumber("float64");
EXPECT_NE(to_number, nullptr);

// Create a Map operation on ds
@@ -1117,7 +1117,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
EXPECT_NE(ds, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("int8"));
std::shared_ptr<TensorOperation> to_number = text::ToNumber("int8");
EXPECT_NE(to_number, nullptr);

// Create a Map operation on ds
@@ -1167,7 +1167,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) {
EXPECT_NE(ds, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("float16"));
std::shared_ptr<TensorOperation> to_number = text::ToNumber("float16");
EXPECT_NE(to_number, nullptr);

// Create a Map operation on ds
@@ -1213,7 +1213,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) {
EXPECT_NE(ds, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("int64"));
std::shared_ptr<TensorOperation> to_number = text::ToNumber("int64");
EXPECT_NE(to_number, nullptr);

// Create a Map operation on ds
@@ -1246,7 +1246,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) {

TEST_F(MindDataTestPipeline, TestToNumberFail4) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
// Test ToNumber with non numerical DataType
// Test ToNumber with non numerical data type

std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

@@ -1255,15 +1255,15 @@ TEST_F(MindDataTestPipeline, TestToNumberFail4) {
EXPECT_NE(ds, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorOperation> to_number1 = text::ToNumber(DataType("string"));
std::shared_ptr<TensorOperation> to_number1 = text::ToNumber("string");

// Expect failure: invalid parameter with non numerical DataType
// Expect failure: invalid parameter with non numerical data type
EXPECT_EQ(to_number1, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorOperation> to_number2 = text::ToNumber(DataType("bool"));
std::shared_ptr<TensorOperation> to_number2 = text::ToNumber("bool");

// Expect failure: invalid parameter with non numerical DataType
// Expect failure: invalid parameter with non numerical data type
EXPECT_EQ(to_number2, nullptr);
}



+ 8
- 7
tests/ut/cpp/dataset/c_api_text_vocab_test.cc View File

@@ -20,8 +20,9 @@
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/text/vocab.h"

using namespace mindspore::dataset;
using mindspore::dataset::DataType;
@@ -49,7 +50,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
EXPECT_EQ(s, Status::OK());

// Create Lookup operation on ds
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32"));
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", "int32");
EXPECT_NE(lookup, nullptr);

// Create Map operation on ds
@@ -93,7 +94,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
EXPECT_EQ(s, Status::OK());

// Create Lookup operation on ds
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", "int32");
EXPECT_NE(lookup, nullptr);

// Create Map operation on ds
@@ -137,7 +138,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {

// Create lookup op for ds
// Expected failure: "<unk>" is not a word of vocab
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32"));
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", "int32");
EXPECT_EQ(lookup, nullptr);
}

@@ -148,7 +149,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {

// Create lookup op
// Expected failure: vocab is null
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", "int32");
EXPECT_EQ(lookup, nullptr);
}

@@ -170,7 +171,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
EXPECT_EQ(home_index, 4);

// Create Lookup operation on ds
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32"));
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", "int32");
EXPECT_NE(lookup, nullptr);

// Create Map operation on ds
@@ -324,7 +325,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
EXPECT_EQ(home_index, 2);

// Create Lookup operation on ds
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home", DataType("int64"));
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home", "int64");
EXPECT_NE(lookup, nullptr);

// Create Map operation on ds


+ 1
- 0
tests/ut/cpp/dataset/ir_callback_test.cc View File

@@ -21,6 +21,7 @@
#include "minddata/dataset/callback/ds_callback.h"
#include "minddata/dataset/core/client.h"
#include "minddata/dataset/engine/datasetops/source/random_data_op.h"
#include "minddata/dataset/engine/tree_adapter.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/kernels/data/no_op.h"


Loading…
Cancel
Save