From: @mhmotallebi Reviewed-by: Signed-off-by:tags/v1.2.0-rc1
| @@ -17,6 +17,7 @@ | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/core/global_context.h" | |||
| #include "minddata/dataset/include/config.h" | |||
| #include "minddata/dataset/util/log_adapter.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| @@ -19,16 +19,33 @@ | |||
| #include <fstream> | |||
| #include <unordered_set> | |||
| #include <utility> | |||
| #include "minddata/dataset/engine/runtime_context.h" | |||
| #include "minddata/dataset/include/samplers.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "minddata/dataset/core/client.h" | |||
| #include "minddata/dataset/engine/consumers/tree_consumer.h" | |||
| #include "minddata/dataset/kernels/c_func_op.h" | |||
| #include "minddata/dataset/kernels/tensor_op.h" | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h" | |||
| #endif | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| #endif | |||
| // Sampler headers (in alphabetical order) | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| // IR non-leaf nodes | |||
| #include "minddata/dataset/engine/ir/datasetops/batch_node.h" | |||
| #ifndef ENABLE_ANDROID | |||
| @@ -57,7 +74,6 @@ | |||
| #endif | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| #include "minddata/dataset/util/services.h" | |||
| @@ -939,6 +955,7 @@ TFRecordDataset::TFRecordDataset(const std::vector<std::string> &dataset_files, | |||
| shard_id, shard_equal_rows, cache); | |||
| ir_node_ = std::static_pointer_cast<DatasetNode>(ds); | |||
| } | |||
| #endif | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -16,11 +16,15 @@ | |||
| #include "minddata/dataset/include/iterator.h" | |||
| #include "minddata/dataset/core/client.h" | |||
| #include "minddata/dataset/engine/consumers/tree_consumer.h" | |||
| #include "minddata/dataset/engine/runtime_context.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| Iterator::Iterator() : consumer_(nullptr) {} | |||
| Iterator::~Iterator() { Stop(); } | |||
| // Get the next row from the data pipeline. | |||
| bool Iterator::GetNextRow(TensorMap *row) { | |||
| Status rc = consumer_->GetNextAsMap(row); | |||
| @@ -23,6 +23,8 @@ | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/core/global_context.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| // IR non-leaf nodes | |||
| #include "minddata/dataset/engine/ir/datasetops/batch_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/concat_node.h" | |||
| @@ -21,6 +21,7 @@ | |||
| #include "minddata/dataset/api/python/pybind_register.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| #include "minddata/dataset/include/constants.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -39,6 +39,7 @@ | |||
| #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | |||
| #endif | |||
| #include "minddata/dataset/core/data_type.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| namespace mindspore { | |||
| @@ -87,7 +88,7 @@ std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_p | |||
| } | |||
| std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token, | |||
| const DataType &data_type) { | |||
| const std::string &data_type) { | |||
| auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type); | |||
| return op->ValidateParams() ? op : nullptr; | |||
| @@ -142,7 +143,7 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const | |||
| return op->ValidateParams() ? op : nullptr; | |||
| } | |||
| std::shared_ptr<ToNumberOperation> ToNumber(const DataType data_type) { | |||
| std::shared_ptr<ToNumberOperation> ToNumber(const std::string &data_type) { | |||
| auto op = std::make_shared<ToNumberOperation>(data_type); | |||
| return op->ValidateParams() ? op : nullptr; | |||
| @@ -200,6 +201,19 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s | |||
| return Status::OK(); | |||
| } | |||
// Helper functions to validate the data type name passed by the user.
// All comparisons are exact, case-sensitive string matches.

/// \brief Check whether a data type name is one of the accepted numeric type names.
/// \param[in] data_type Name of the data type, e.g. "int32".
/// \return true if data_type equals one of the integer or floating point names in the table below;
///     false for every other string (including "bool", "string" and the empty string).
bool IsTypeNumeric(const std::string &data_type) {
  // Single table of accepted names, so that supporting another numeric type is a one-line change.
  static constexpr const char *kNumericTypeNames[] = {"int8",  "uint8",  "int16",   "uint16",  "int32",  "uint32",
                                                      "int64", "uint64", "float16", "float32", "float64"};
  for (const char *name : kNumericTypeNames) {
    if (data_type == name) {
      return true;
    }
  }
  return false;
}

/// \brief Check whether a data type name is exactly "bool".
/// \param[in] data_type Name of the data type.
/// \return true only when data_type == "bool".
bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; }

/// \brief Check whether a data type name is exactly "string".
/// \param[in] data_type Name of the data type.
/// \return true only when data_type == "string".
bool IsTypeString(const std::string &data_type) { return data_type == "string"; }
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| // (In alphabetical order) | |||
| @@ -239,6 +253,8 @@ BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &voc | |||
| preserve_unused_token_(preserve_unused_token), | |||
| with_offsets_(with_offsets) {} | |||
| BertTokenizerOperation::~BertTokenizerOperation() = default; | |||
| Status BertTokenizerOperation::ValidateParams() { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "BertTokenizer: vocab object type is incorrect or null."; | |||
| @@ -303,9 +319,11 @@ std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() { | |||
| // LookupOperation | |||
| LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token, | |||
| const DataType &data_type) | |||
| const std::string &data_type) | |||
| : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {} | |||
| LookupOperation::~LookupOperation() = default; | |||
| Status LookupOperation::ValidateParams() { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "Lookup: vocab object type is incorrect or null."; | |||
| @@ -320,7 +338,7 @@ Status LookupOperation::ValidateParams() { | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (!data_type_.IsNumeric()) { | |||
| if (!IsTypeNumeric(data_type_)) { | |||
| std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| @@ -330,7 +348,7 @@ Status LookupOperation::ValidateParams() { | |||
| } | |||
| std::shared_ptr<TensorOp> LookupOperation::Build() { | |||
| std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, data_type_); | |||
| std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_)); | |||
| return tensor_op; | |||
| } | |||
| @@ -419,6 +437,8 @@ std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() { | |||
| #endif | |||
| // SentencePieceTokenizerOperation | |||
| SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default; | |||
| SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, | |||
| SPieceTokenizerOutType out_type) | |||
| : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {} | |||
| @@ -482,11 +502,11 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() { | |||
| } | |||
| // ToNumberOperation | |||
| ToNumberOperation::ToNumberOperation(DataType data_type) : data_type_(data_type) {} | |||
| ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {} | |||
| Status ToNumberOperation::ValidateParams() { | |||
| if (!data_type_.IsNumeric() || data_type_.IsBool()) { | |||
| std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_.ToString(); | |||
| if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) { | |||
| std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| @@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 }; | |||
| // Possible values for SPieceTokenizerLoadType | |||
| enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 }; | |||
| // Possible values for SentencePieceModel | |||
| enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 }; | |||
| // Possible values for NormalizeForm | |||
| enum class NormalizeForm { | |||
| kNone = 0, | |||
| @@ -19,6 +19,7 @@ | |||
| #include <memory> | |||
| #include <mutex> | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/util/allocator.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| @@ -27,7 +28,6 @@ namespace mindspore { | |||
| namespace dataset { | |||
| // forward declare | |||
| class MemoryPool; | |||
| class ConfigManager; | |||
| class Tensor; | |||
| class CVTensor; | |||
| @@ -22,8 +22,10 @@ | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/consumers/tree_consumer.h" | |||
| #include "minddata/dataset/engine/tree_adapter.h" | |||
| #include "minddata/dataset/engine/datasetops/device_queue_op.h" | |||
| #include "minddata/dataset/engine/opt/pre/getter_pass.h" | |||
| #include "minddata/dataset/engine/tree_adapter.h" | |||
| #include "minddata/mindrecord/include/shard_index_generator.h" | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/mindrecord/include/shard_header.h" | |||
| @@ -23,6 +23,7 @@ | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| namespace mindspore { | |||
| @@ -24,13 +24,26 @@ | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/engine/consumers/tree_consumer.h" | |||
| #include "minddata/dataset/engine/data_schema.h" | |||
| #include "minddata/dataset/engine/datasetops/filter_op.h" | |||
| #include "minddata/dataset/engine/datasetops/map_op/map_op.h" | |||
| #include "minddata/dataset/engine/datasetops/project_op.h" | |||
| #include "minddata/dataset/engine/datasetops/repeat_op.h" | |||
| #include "minddata/dataset/engine/datasetops/shuffle_op.h" | |||
| #include "minddata/dataset/engine/datasetops/skip_op.h" | |||
| #include "minddata/dataset/engine/datasetops/take_op.h" | |||
| #include "minddata/dataset/engine/ir/cache/dataset_cache.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| class Dataset; | |||
| class DatasetCache; | |||
| class SamplerObj; | |||
| class IRNodePass; | |||
| class DatasetSizeGetter; | |||
| @@ -22,6 +22,7 @@ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/datasetops/source/mindrecord_op.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| namespace mindspore { | |||
| @@ -24,6 +24,7 @@ | |||
| #include "minddata/dataset/engine/datasetops/source/random_data_op.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -22,7 +22,9 @@ | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| #include "minddata/dataset/include/samplers.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -20,8 +20,6 @@ | |||
| #include <cstdint> | |||
| #include <string> | |||
| #include "minddata/dataset/util/log_adapter.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 }; | |||
| // Possible values for SPieceTokenizerLoadType | |||
| enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 }; | |||
| // Possible values for SentencePieceModel | |||
| enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 }; | |||
| // Possible values for NormalizeForm | |||
| enum class NormalizeForm { | |||
| kNone = 0, | |||
| @@ -17,6 +17,7 @@ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_ | |||
| #include <sys/stat.h> | |||
| #include <unistd.h> | |||
| #include <map> | |||
| #include <memory> | |||
| @@ -26,27 +27,18 @@ | |||
| #include <unordered_set> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/ir/cache/dataset_cache.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/engine/consumers/tree_consumer.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| #include "minddata/dataset/include/iterator.h" | |||
| #include "minddata/dataset/include/samplers.h" | |||
| #include "minddata/dataset/include/tensor.h" | |||
| #include "minddata/dataset/include/text.h" | |||
| #include "minddata/dataset/include/type_id.h" | |||
| #include "minddata/dataset/kernels/c_func_op.h" | |||
| #include "minddata/dataset/kernels/tensor_op.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| class Tensor; | |||
| class TensorRow; | |||
| class TensorShape; | |||
| class TreeAdapter; | |||
| class TreeGetters; | |||
| @@ -54,6 +46,7 @@ class TreeGetters; | |||
| class Vocab; | |||
| #endif | |||
| class DatasetCache; | |||
| class DatasetNode; | |||
| class Iterator; | |||
| @@ -77,12 +70,20 @@ class ConcatDataset; | |||
| class RenameDataset; | |||
| #endif | |||
| #ifndef ENABLE_ANDROID | |||
| class SentencePieceVocab; | |||
| enum class SentencePieceModel; | |||
| #endif | |||
| class DSCallback; | |||
| class RepeatDataset; | |||
| #ifndef ENABLE_ANDROID | |||
| class SkipDataset; | |||
| class TakeDataset; | |||
| class ZipDataset; | |||
| #endif | |||
| /// \class Dataset datasets.h | |||
| @@ -969,8 +970,12 @@ std::shared_ptr<TFRecordDataset> TFRecord(const std::vector<std::string> &datase | |||
| } else { | |||
| std::string schema_path = schema; | |||
| if (!schema_path.empty()) { | |||
| Path schema_file(schema_path); | |||
| if (!schema_file.Exists()) { | |||
| struct stat sb; | |||
| int rc = stat(common::SafeCStr(schema_path), &sb); | |||
| if (rc == -1 && errno != ENOENT) { | |||
| MS_LOG(WARNING) << "Unable to query the status of [" << schema_path << "]. Errno = " << errno << "."; | |||
| } | |||
| if (rc != 0) { | |||
| MS_LOG(ERROR) << "TFRecordDataset: schema path [" << schema_path << "] is invalid or does not exist."; | |||
| return nullptr; | |||
| } | |||
| @@ -14,14 +14,14 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "include/ms_tensor.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| #include "minddata/dataset/include/tensor.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace tensor { | |||
| class DETensor : public mindspore::tensor::MSTensor { | |||
| @@ -79,4 +79,4 @@ class DETensor : public mindspore::tensor::MSTensor { | |||
| }; | |||
| } // namespace tensor | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_ | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_ | |||
| @@ -14,12 +14,13 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef DATASET_API_EXECUTE_H_ | |||
| #define DATASET_API_EXECUTE_H_ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/include/constants.h" | |||
| #ifdef ENABLE_ANDROID | |||
| #include "minddata/dataset/include/de_tensor.h" | |||
| #endif | |||
| @@ -55,4 +56,4 @@ class Execute { | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // DATASET_API_EXECUTE_H_ | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_ | |||
| @@ -21,7 +21,6 @@ | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/runtime_context.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| namespace mindspore { | |||
| @@ -45,10 +44,10 @@ using TensorVec = std::vector<std::shared_ptr<Tensor>>; | |||
| class Iterator { | |||
| public: | |||
| /// \brief Constructor | |||
| Iterator() : consumer_(nullptr) {} | |||
| Iterator(); | |||
| /// \brief Destructor | |||
| ~Iterator() { Stop(); } | |||
| ~Iterator(); | |||
| /// \brief Method for building and launching the pipeline. | |||
| /// \param[in] ops - a vector of DatasetOp in the data pipeline. | |||
| @@ -21,10 +21,11 @@ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/mindrecord/include/shard_column.h" | |||
| #include "minddata/mindrecord/include/shard_error.h" | |||
| #include "minddata/mindrecord/include/shard_operator.h" | |||
| #include "minddata/mindrecord/include/shard_reader.h" | |||
| #endif | |||
| @@ -51,6 +51,13 @@ namespace dataset { | |||
| } \ | |||
| } while (false) | |||
| #define CHECK_FAIL_RETURN_SYNTAX_ERROR(_condition, _e) \ | |||
| do { \ | |||
| if (!(_condition)) { \ | |||
| return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \ | |||
| } \ | |||
| } while (false) | |||
| #define RETURN_UNEXPECTED_IF_NULL(_ptr) \ | |||
| do { \ | |||
| if ((_ptr) == nullptr) { \ | |||
| @@ -71,6 +78,15 @@ namespace dataset { | |||
| return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \ | |||
| } while (false) | |||
// Evaluate the Status expression _s exactly once. If it reports an error (IsError()), log the
// Status at ERROR level and return _r from the enclosing function; otherwise fall through.
// _r is intentionally not wrapped in parentheses so that a braced-init-list such as {} can be
// passed as the value to return.
// The local variable avoids a double underscore: such identifiers are reserved to the implementation.
#define RETURN_SECOND_IF_ERROR(_s, _r)        \
  do {                                        \
    Status rc_second_if_error_ = (_s);        \
    if (rc_second_if_error_.IsError()) {      \
      MS_LOG(ERROR) << rc_second_if_error_;   \
      return _r;                              \
    }                                         \
  } while (false)
| enum class StatusCode : char { | |||
| kOK = 0, | |||
| kOutOfMemory = 1, | |||
| @@ -151,6 +167,12 @@ class Status { | |||
| StatusCode code_; | |||
| std::string err_msg_; | |||
| }; | |||
#if !defined(_WIN32) && !defined(_WIN64)
// Memory usage threshold; only available on non-Windows builds.
// NOTE(review): presumably a fraction of total memory (0.95 == 95%) that is compared against
// the value returned by GetMemoryUsage() - confirm against the callers.
// constexpr with an 'f' suffix keeps the constant a true float usable in constant expressions,
// instead of a double literal implicitly converted at initialisation.
constexpr float MAX_MEMORY_USAGE_THRESHOLD = 0.95f;
// Declared here, defined elsewhere.
// NOTE(review): presumably returns the current memory usage in the same unit as the threshold - confirm.
float GetMemoryUsage();
#endif
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_STATUS_H_ | |||
| @@ -22,18 +22,16 @@ | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "mindspore/ccsrc/minddata/dataset/core/data_type.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/include/constants.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| class Vocab; | |||
| class SentencePieceVocab; | |||
| // Transform operations for text | |||
| namespace text { | |||
| @@ -146,10 +144,11 @@ std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_p | |||
| /// \param[in] vocab a Vocab object. | |||
| /// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov). | |||
| /// If unknown_token is oov, runtime error will be thrown. | |||
| /// \param[in] DataType type of the tensor after lookup, typically int32. | |||
| /// \param[in] data_type type of the tensor after lookup, typically int32. | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token, | |||
| const mindspore::dataset::DataType &data_type = DataType("int32")); | |||
| const std::string &data_type = "int32"); | |||
| /// \brief TensorOp to generate n-gram from a 1-D string Tensor. | |||
| /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result | |||
| @@ -226,9 +225,9 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const | |||
| /// https://en.cppreference.com/w/cpp/string/basic_string/stof, | |||
| /// https://en.cppreference.com/w/cpp/string/basic_string/stoul, | |||
| /// except that any strings which represent negative numbers cannot be cast to an unsigned integer type. | |||
| /// \param[in] data_type DataType of the tensor to be casted to. Must be a numeric type. | |||
| /// \param[in] data_type Data type of the tensor to be cast to. Must be a numeric type. | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<ToNumberOperation> ToNumber(const DataType data_type); | |||
| std::shared_ptr<ToNumberOperation> ToNumber(const std::string &data_type); | |||
| /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length. | |||
| /// \param[in] max_length Maximum length required. | |||
| @@ -285,7 +284,7 @@ class BertTokenizerOperation : public TensorOperation { | |||
| bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, | |||
| bool with_offsets); | |||
| ~BertTokenizerOperation() = default; | |||
| ~BertTokenizerOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| @@ -342,9 +341,9 @@ class JiebaTokenizerOperation : public TensorOperation { | |||
| class LookupOperation : public TensorOperation { | |||
| public: | |||
| explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token, | |||
| const DataType &data_type); | |||
| const std::string &data_type); | |||
| ~LookupOperation() = default; | |||
| ~LookupOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| @@ -356,7 +355,7 @@ class LookupOperation : public TensorOperation { | |||
| std::shared_ptr<Vocab> vocab_; | |||
| std::string unknown_token_; | |||
| int32_t default_id_; | |||
| DataType data_type_; | |||
| std::string data_type_; | |||
| }; | |||
| class NgramOperation : public TensorOperation { | |||
| @@ -439,7 +438,7 @@ class SentencePieceTokenizerOperation : public TensorOperation { | |||
| SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type); | |||
| ~SentencePieceTokenizerOperation() = default; | |||
| ~SentencePieceTokenizerOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| @@ -473,7 +472,7 @@ class SlidingWindowOperation : public TensorOperation { | |||
| class ToNumberOperation : public TensorOperation { | |||
| public: | |||
| explicit ToNumberOperation(DataType data_type); | |||
| explicit ToNumberOperation(std::string data_type); | |||
| ~ToNumberOperation() = default; | |||
| @@ -484,7 +483,7 @@ class ToNumberOperation : public TensorOperation { | |||
| std::string Name() const override { return kToNumberOperation; } | |||
| private: | |||
| DataType data_type_; | |||
| std::string data_type_; | |||
| }; | |||
| class TruncateSequencePairOperation : public TensorOperation { | |||
| @@ -16,7 +16,6 @@ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_ | |||
| #include "minddata/dataset/core/data_type.h" | |||
| #include "mindspore/core/ir/dtype/type_id.h" | |||
| namespace mindspore { | |||
| @@ -22,10 +22,11 @@ | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "minddata/dataset/include/constants.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 }; | |||
| class SentencePieceVocab { | |||
| public: | |||
| static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size, | |||
| @@ -22,6 +22,7 @@ | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| using mindspore::dataset::Tensor; | |||
| using mindspore::dataset::Status; | |||
| @@ -17,8 +17,6 @@ | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/vision.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" | |||
| using namespace mindspore::dataset; | |||
| // Helper function to get the session id from SESSION_ID env variable | |||
| @@ -28,7 +26,6 @@ class MindDataTestCacheOp : public UT::DatasetOpTesting { | |||
| public: | |||
| void SetUp() override { | |||
| DatasetOpTesting::SetUp(); | |||
| GlobalInit(); | |||
| } | |||
| }; | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/core/tensor_row.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/vision.h" | |||
| @@ -14,6 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| using namespace mindspore::dataset; | |||
| @@ -18,11 +18,12 @@ | |||
| #include <string> | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/include/constants.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/include/text.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::dataset::SentencePieceModel; | |||
| @@ -21,11 +21,11 @@ | |||
| #include "minddata/dataset/include/config.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/include/text.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::dataset::DataType; | |||
| using mindspore::dataset::ShuffleMode; | |||
| using mindspore::dataset::Status; | |||
| using mindspore::dataset::Tensor; | |||
| @@ -1011,7 +1011,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess1) { | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("int64")); | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber("int64"); | |||
| EXPECT_NE(to_number, nullptr); | |||
| // Create a Map operation on ds | |||
| @@ -1064,7 +1064,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) { | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("float64")); | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber("float64"); | |||
| EXPECT_NE(to_number, nullptr); | |||
| // Create a Map operation on ds | |||
| @@ -1117,7 +1117,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) { | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("int8")); | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber("int8"); | |||
| EXPECT_NE(to_number, nullptr); | |||
| // Create a Map operation on ds | |||
| @@ -1167,7 +1167,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) { | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("float16")); | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber("float16"); | |||
| EXPECT_NE(to_number, nullptr); | |||
| // Create a Map operation on ds | |||
| @@ -1213,7 +1213,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) { | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber(DataType("int64")); | |||
| std::shared_ptr<TensorOperation> to_number = text::ToNumber("int64"); | |||
| EXPECT_NE(to_number, nullptr); | |||
| // Create a Map operation on ds | |||
| @@ -1246,7 +1246,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) { | |||
| TEST_F(MindDataTestPipeline, TestToNumberFail4) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4."; | |||
| // Test ToNumber with non numerical DataType | |||
| // Test ToNumber with non numerical data type | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt"; | |||
| @@ -1255,15 +1255,15 @@ TEST_F(MindDataTestPipeline, TestToNumberFail4) { | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorOperation> to_number1 = text::ToNumber(DataType("string")); | |||
| std::shared_ptr<TensorOperation> to_number1 = text::ToNumber("string"); | |||
| // Expect failure: invalid parameter with non numerical DataType | |||
| // Expect failure: invalid parameter with non numerical data type | |||
| EXPECT_EQ(to_number1, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorOperation> to_number2 = text::ToNumber(DataType("bool")); | |||
| std::shared_ptr<TensorOperation> to_number2 = text::ToNumber("bool"); | |||
| // Expect failure: invalid parameter with non numerical DataType | |||
| // Expect failure: invalid parameter with non numerical data type | |||
| EXPECT_EQ(to_number2, nullptr); | |||
| } | |||
| @@ -20,8 +20,9 @@ | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/include/text.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::dataset::DataType; | |||
| @@ -49,7 +50,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) { | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create Lookup operation on ds | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32")); | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", "int32"); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| @@ -93,7 +94,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) { | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create Lookup operation on ds | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32")); | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", "int32"); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| @@ -137,7 +138,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) { | |||
| // Create lookup op for ds | |||
| // Expected failure: "<unk>" is not a word of vocab | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32")); | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", "int32"); | |||
| EXPECT_EQ(lookup, nullptr); | |||
| } | |||
| @@ -148,7 +149,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) { | |||
| // Create lookup op | |||
| // Expected failure: vocab is null | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32")); | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", "int32"); | |||
| EXPECT_EQ(lookup, nullptr); | |||
| } | |||
| @@ -170,7 +171,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) { | |||
| EXPECT_EQ(home_index, 4); | |||
| // Create Lookup operation on ds | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32")); | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", "int32"); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| @@ -324,7 +325,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) { | |||
| EXPECT_EQ(home_index, 2); | |||
| // Create Lookup operation on ds | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home", DataType("int64")); | |||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home", "int64"); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| @@ -21,6 +21,7 @@ | |||
| #include "minddata/dataset/callback/ds_callback.h" | |||
| #include "minddata/dataset/core/client.h" | |||
| #include "minddata/dataset/engine/datasetops/source/random_data_op.h" | |||
| #include "minddata/dataset/engine/tree_adapter.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/kernels/data/no_op.h" | |||