| @@ -26,4 +26,5 @@ add_library(cpp-API OBJECT | |||||
| iterator.cc | iterator.cc | ||||
| transforms.cc | transforms.cc | ||||
| samplers.cc | samplers.cc | ||||
| text.cc | |||||
| ) | ) | ||||
| @@ -34,6 +34,7 @@ | |||||
| #include "minddata/dataset/engine/datasetops/source/voc_op.h" | #include "minddata/dataset/engine/datasetops/source/voc_op.h" | ||||
| // Dataset operator headers (in alphabetical order) | // Dataset operator headers (in alphabetical order) | ||||
| #include "minddata/dataset/engine/datasetops/batch_op.h" | #include "minddata/dataset/engine/datasetops/batch_op.h" | ||||
| #include "minddata/dataset/engine/datasetops/build_vocab_op.h" | |||||
| #include "minddata/dataset/engine/datasetops/concat_op.h" | #include "minddata/dataset/engine/datasetops/concat_op.h" | ||||
| #include "minddata/dataset/engine/datasetops/map_op/map_op.h" | #include "minddata/dataset/engine/datasetops/map_op/map_op.h" | ||||
| #include "minddata/dataset/engine/datasetops/project_op.h" | #include "minddata/dataset/engine/datasetops/project_op.h" | ||||
| @@ -263,6 +264,37 @@ std::shared_ptr<BatchDataset> Dataset::Batch(int32_t batch_size, bool drop_remai | |||||
| return ds; | return ds; | ||||
| } | } | ||||
| // Function to create a Vocab from dataset | |||||
| std::shared_ptr<Vocab> Dataset::BuildVocab(const std::vector<std::string> &columns, | |||||
| const std::pair<int64_t, int64_t> &freq_range, int64_t top_k, | |||||
| const std::vector<std::string> &special_tokens, bool special_first) { | |||||
| auto vocab = std::make_shared<Vocab>(); | |||||
| auto ds = std::make_shared<BuildVocabDataset>(vocab, columns, freq_range, top_k, special_tokens, special_first); | |||||
| if (!ds->ValidateParams()) { | |||||
| return nullptr; | |||||
| } | |||||
| ds->children.push_back(shared_from_this()); | |||||
| // Run the tree here to start building the vocab | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| if (iter == nullptr) { | |||||
| MS_LOG(ERROR) << "Fail to run iterator in BuildVocab."; | |||||
| return nullptr; | |||||
| } | |||||
| // Finish building vocab by triggering GetNextRow | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| if (vocab == nullptr) { | |||||
| MS_LOG(ERROR) << "Fail to build vocab."; | |||||
| return nullptr; | |||||
| } | |||||
| return vocab; | |||||
| } | |||||
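For illustration, a minimal sketch of driving the new API end to end, assuming the includes this diff touches (datasets.h, vocab.h, constants.h); the file path and column name here are hypothetical:

    std::shared_ptr<Dataset> ds = TextFile({"/path/to/words.txt"}, 0, ShuffleMode::kFalse);
    std::shared_ptr<Vocab> vocab =
        ds->BuildVocab({"text"}, {0, kDeMaxFreq}, kDeMaxTopk, {"<pad>", "<unk>"}, true);
    if (vocab != nullptr) {
      int32_t id = vocab->Lookup("home");  // returns Vocab::kNoTokenExists (-1) if "home" is absent
    }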
| // Function to create a Concat dataset | // Function to create a Concat dataset | ||||
| std::shared_ptr<ConcatDataset> Dataset::Concat(const std::vector<std::shared_ptr<Dataset>> &datasets) { | std::shared_ptr<ConcatDataset> Dataset::Concat(const std::vector<std::shared_ptr<Dataset>> &datasets) { | ||||
| auto ds = std::make_shared<ConcatDataset>(datasets); | auto ds = std::make_shared<ConcatDataset>(datasets); | ||||
| @@ -1450,13 +1482,52 @@ std::vector<std::shared_ptr<DatasetOp>> BatchDataset::Build() { | |||||
| bool BatchDataset::ValidateParams() { | bool BatchDataset::ValidateParams() { | ||||
| if (batch_size_ <= 0) { | if (batch_size_ <= 0) { | ||||
| MS_LOG(ERROR) << "Batch: Batch size cannot be negative"; | |||||
| MS_LOG(ERROR) << "Batch: batch_size should be positive integer, but got: " << batch_size_; | |||||
| return false; | return false; | ||||
| } | } | ||||
| return true; | return true; | ||||
| } | } | ||||
| BuildVocabDataset::BuildVocabDataset(std::shared_ptr<Vocab> vocab, const std::vector<std::string> &columns, | |||||
| const std::pair<int64_t, int64_t> &freq_range, int64_t top_k, | |||||
| const std::vector<std::string> &special_tokens, bool special_first) | |||||
| : vocab_(vocab), | |||||
| columns_(columns), | |||||
| freq_range_(freq_range), | |||||
| top_k_(top_k), | |||||
| special_tokens_(special_tokens), | |||||
| special_first_(special_first) {} | |||||
| // Function to build BuildVocabDataset | |||||
| std::vector<std::shared_ptr<DatasetOp>> BuildVocabDataset::Build() { | |||||
| // A vector containing shared pointers to the Dataset Ops that this object will create | |||||
| std::vector<std::shared_ptr<DatasetOp>> node_ops; | |||||
| std::shared_ptr<BuildVocabOp> build_vocab_op; | |||||
| build_vocab_op = std::make_shared<BuildVocabOp>(vocab_, columns_, freq_range_, top_k_, special_tokens_, | |||||
| special_first_, num_workers_, connector_que_size_); | |||||
| node_ops.push_back(build_vocab_op); | |||||
| return node_ops; | |||||
| } | |||||
| bool BuildVocabDataset::ValidateParams() { | |||||
| if (vocab_ == nullptr) { | |||||
| MS_LOG(ERROR) << "BuildVocab: vocab is null."; | |||||
| return false; | |||||
| } | |||||
| if (top_k_ < 0) { | |||||
| MS_LOG(ERROR) << "BuildVocab: top_k shoule be positive, but got: " << top_k_; | |||||
| return false; | |||||
| } | |||||
| if (freq_range_.first < 0 || freq_range_.second > kDeMaxFreq || freq_range_.first > freq_range_.second) { | |||||
| MS_LOG(ERROR) << "BuildVocab: requency_range [a,b] should be 0 <= a <= b (a,b are inclusive), " | |||||
| << "but got [" << freq_range_.first << ", " << freq_range_.second << "]"; | |||||
| return false; | |||||
| } | |||||
| return true; | |||||
| } | |||||
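To make the interaction of the two checks above concrete, a hedged example (word counts invented); note that freq_range is applied before top_k:

    // With counts {"the": 9, "cat": 4, "on": 1}, freq_range {2, 9} first drops "on",
    // then top_k = 1 keeps only "the".
    std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {2, 9}, 1, {}, true);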
| // Function to build ConcatOp | // Function to build ConcatOp | ||||
| ConcatDataset::ConcatDataset(const std::vector<std::shared_ptr<Dataset>> &datasets) : datasets_(datasets) { | ConcatDataset::ConcatDataset(const std::vector<std::shared_ptr<Dataset>> &datasets) : datasets_(datasets) { | ||||
| this->children = datasets_; | this->children = datasets_; | ||||
| @@ -0,0 +1,64 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "minddata/dataset/include/text.h" | |||||
| #include "minddata/dataset/text/kernels/lookup_op.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| namespace api { | |||||
| namespace text { | |||||
| std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token) { | |||||
| auto op = std::make_shared<LookupOperation>(vocab, unknown_token); | |||||
| if (!op->ValidateParams()) { | |||||
| return nullptr; | |||||
| } | |||||
| return op; | |||||
| } | |||||
| // LookupOperation | |||||
| LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token) | |||||
| : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists) {} | |||||
| bool LookupOperation::ValidateParams() { | |||||
| if (vocab_ == nullptr) { | |||||
| LOG(ERROR) << "Lookup: vocab object type is incorrect or null."; | |||||
| return false; | |||||
| } | |||||
| if (unknown_token_.empty()) { | |||||
| LOG(ERROR) << "Lookup: no unknown token is specified."; | |||||
| return false; | |||||
| } else { | |||||
| default_id_ = vocab_->Lookup(unknown_token_); | |||||
| if (default_id_ == Vocab::kNoTokenExists) { | |||||
| LOG(ERROR) << "Lookup: unknown_token: [" + unknown_token_ + "], does not exist in vocab."; | |||||
| return false; | |||||
| } | |||||
| } | |||||
| return true; | |||||
| } | |||||
| std::shared_ptr<TensorOp> LookupOperation::Build() { | |||||
| std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_); | |||||
| return tensor_op; | |||||
| } | |||||
| } // namespace text | |||||
| } // namespace api | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
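A short usage sketch for the new text::Lookup factory, assuming a previously built vocab and a dataset with a "text" column:

    // Out-of-vocabulary words map to the id of "<unk>"; Lookup itself returns
    // nullptr if the vocab is null or "<unk>" is not in the vocab.
    std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
    if (lookup != nullptr) {
      ds = ds->Map({lookup}, {"text"});
    }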
| @@ -59,6 +59,8 @@ inline void BitClear(uint32_t *bits, uint32_t bitMask) { *bits &= (~bitMask); } | |||||
| constexpr int32_t kDeMaxDim = std::numeric_limits<int32_t>::max(); // 2147483647 or 2^31 - 1 | constexpr int32_t kDeMaxDim = std::numeric_limits<int32_t>::max(); // 2147483647 or 2^31 - 1 | ||||
| constexpr int32_t kDeMaxRank = std::numeric_limits<int32_t>::max(); | constexpr int32_t kDeMaxRank = std::numeric_limits<int32_t>::max(); | ||||
| constexpr int64_t kDeMaxFreq = std::numeric_limits<int64_t>::max(); // 9223372036854775807 or 2^63 - 1 | |||||
| constexpr int64_t kDeMaxTopk = std::numeric_limits<int64_t>::max(); | |||||
| constexpr uint32_t kCfgRowsPerBuffer = 1; | constexpr uint32_t kCfgRowsPerBuffer = 1; | ||||
| constexpr uint32_t kCfgParallelWorkers = 4; | constexpr uint32_t kCfgParallelWorkers = 4; | ||||
| @@ -30,6 +30,7 @@ | |||||
| #include "minddata/dataset/include/iterator.h" | #include "minddata/dataset/include/iterator.h" | ||||
| #include "minddata/dataset/include/samplers.h" | #include "minddata/dataset/include/samplers.h" | ||||
| #include "minddata/dataset/include/type_id.h" | #include "minddata/dataset/include/type_id.h" | ||||
| #include "minddata/dataset/text/vocab.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace dataset { | namespace dataset { | ||||
| @@ -39,6 +40,7 @@ class DatasetOp; | |||||
| class DataSchema; | class DataSchema; | ||||
| class Tensor; | class Tensor; | ||||
| class TensorShape; | class TensorShape; | ||||
| class Vocab; | |||||
| namespace api { | namespace api { | ||||
| @@ -61,6 +63,7 @@ class TextFileDataset; | |||||
| class VOCDataset; | class VOCDataset; | ||||
| // Dataset Op classes (in alphabetical order) | // Dataset Op classes (in alphabetical order) | ||||
| class BatchDataset; | class BatchDataset; | ||||
| class BuildVocabDataset; | |||||
| class ConcatDataset; | class ConcatDataset; | ||||
| class MapDataset; | class MapDataset; | ||||
| class ProjectDataset; | class ProjectDataset; | ||||
| @@ -325,6 +328,24 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||||
| /// \return Shared pointer to the current BatchDataset | /// \return Shared pointer to the current BatchDataset | ||||
| std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false); | std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false); | ||||
| /// \brief Function to create a Vocab from source dataset | |||||
| /// \notes Build a vocab from a dataset. This collects all the unique words in the dataset and returns a vocab | |||||
| /// that contains the top_k most frequent words (if top_k is specified) | |||||
| /// \param[in] columns Column names to get words from. It can be a vector of column names | |||||
| /// \param[in] freq_range A tuple of integers (min_frequency, max_frequency). Words within the frequency | |||||
| /// range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency | |||||
| /// can be set to default, which corresponds to 0/total_words respectively | |||||
| /// \param[in] top_k Number of words to be built into vocab. The top_k most frequent words are | |||||
| /// taken. top_k is applied after freq_range. If fewer than top_k words remain, all of them are taken | |||||
| /// \param[in] special_tokens A list of strings, each one is a special token | |||||
| /// \param[in] special_first Whether special_tokens will be prepended/appended to vocab. If special_tokens | |||||
| /// is specified and special_first is set to default, special_tokens will be prepended | |||||
| /// \return Shared pointer to the current Vocab | |||||
| std::shared_ptr<Vocab> BuildVocab(const std::vector<std::string> &columns = {}, | |||||
| const std::pair<int64_t, int64_t> &freq_range = {0, kDeMaxFreq}, | |||||
| int64_t top_k = kDeMaxTopk, const std::vector<std::string> &special_tokens = {}, | |||||
| bool special_first = true); | |||||
| /// \brief Function to create a ConcatDataset | /// \brief Function to create a ConcatDataset | ||||
| /// \notes Concat the datasets in the input | /// \notes Concat the datasets in the input | ||||
| /// \param[in] datasets List of shared pointers to the dataset that should be concatenated together | /// \param[in] datasets List of shared pointers to the dataset that should be concatenated together | ||||
| @@ -859,6 +880,33 @@ class BatchDataset : public Dataset { | |||||
| std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> pad_map_; | std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> pad_map_; | ||||
| }; | }; | ||||
| class BuildVocabDataset : public Dataset { | |||||
| public: | |||||
| /// \brief Constructor | |||||
| BuildVocabDataset(std::shared_ptr<Vocab> vocab, const std::vector<std::string> &columns, | |||||
| const std::pair<int64_t, int64_t> &freq_range, int64_t top_k, | |||||
| const std::vector<std::string> &special_tokens, bool special_first); | |||||
| /// \brief Destructor | |||||
| ~BuildVocabDataset() = default; | |||||
| /// \brief a base class override function to create the required runtime dataset op objects for this class | |||||
| /// \return The list of shared pointers to the newly created DatasetOps | |||||
| std::vector<std::shared_ptr<DatasetOp>> Build() override; | |||||
| /// \brief Parameters validation | |||||
| /// \return bool true if all the params are valid | |||||
| bool ValidateParams() override; | |||||
| private: | |||||
| std::shared_ptr<Vocab> vocab_; | |||||
| std::vector<std::string> columns_; | |||||
| std::pair<int64_t, int64_t> freq_range_; | |||||
| int64_t top_k_; | |||||
| std::vector<std::string> special_tokens_; | |||||
| bool special_first_; | |||||
| }; | |||||
| class ConcatDataset : public Dataset { | class ConcatDataset : public Dataset { | ||||
| public: | public: | ||||
| /// \brief Constructor | /// \brief Constructor | ||||
| @@ -0,0 +1,65 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_ | |||||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_ | |||||
| #include <vector> | |||||
| #include <memory> | |||||
| #include <string> | |||||
| #include "minddata/dataset/core/constants.h" | |||||
| #include "minddata/dataset/include/transforms.h" | |||||
| #include "minddata/dataset/text/vocab.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| namespace api { | |||||
| // Transform operations for text | |||||
| namespace text { | |||||
| // Text Op classes (in alphabetical order) | |||||
| class LookupOperation; | |||||
| /// \brief Lookup operator that looks up a word to an id. | |||||
| /// \param[in] vocab a Vocab object. | |||||
| /// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov). | |||||
| /// If unknown_token itself is out of the vocabulary, Lookup returns nullptr | |||||
| /// \return Shared pointer to the current TensorOperation. | |||||
| std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token); | |||||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||||
| class LookupOperation : public TensorOperation { | |||||
| public: | |||||
| explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token); | |||||
| ~LookupOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| bool ValidateParams() override; | |||||
| private: | |||||
| std::shared_ptr<Vocab> vocab_; | |||||
| std::string unknown_token_; | |||||
| int32_t default_id_; | |||||
| }; | |||||
| } // namespace text | |||||
| } // namespace api | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_ | |||||
| @@ -17,8 +17,10 @@ | |||||
| #include <unordered_set> | #include <unordered_set> | ||||
| #include <unordered_map> | #include <unordered_map> | ||||
| #include <utility> | #include <utility> | ||||
| #include <algorithm> | |||||
| #include "minddata/dataset/text/vocab.h" | #include "minddata/dataset/text/vocab.h" | ||||
| #include "utils/log_adapter.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace dataset { | namespace dataset { | ||||
| @@ -51,6 +53,147 @@ Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tok | |||||
| return Status::OK(); | return Status::OK(); | ||||
| } | } | ||||
| Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) { | |||||
| std::unordered_map<WordType, WordIdType> word2id; | |||||
| for (auto p : words) { | |||||
| word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second); | |||||
| } | |||||
| *vocab = std::make_shared<Vocab>(std::move(word2id)); | |||||
| return Status::OK(); | |||||
| } | |||||
| void Vocab::append_word(const std::string &word) { | |||||
| if (word2id_.find(word) == word2id_.end()) { | |||||
| word2id_[word] = word2id_.size(); | |||||
| } | |||||
| } | |||||
| Status Vocab::BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words, | |||||
| std::shared_ptr<Vocab> *vocab) { | |||||
| // Validate parameters and build map | |||||
| std::unordered_map<WordType, WordIdType> word2id; | |||||
| for (auto p : words) { | |||||
| if (p.second < 0) { | |||||
| MS_LOG(ERROR) << "index can not be negetive, but got " << p.second; | |||||
| RETURN_STATUS_UNEXPECTED("index can not be negetive, but got " + std::to_string(p.second)); | |||||
| } | |||||
| word2id[p.first] = p.second; | |||||
| } | |||||
| *vocab = std::make_shared<Vocab>(std::move(word2id)); | |||||
| return Status::OK(); | |||||
| } | |||||
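For reference, a hedged sketch of calling BuildFromUnorderedMap directly (map contents invented):

    std::unordered_map<std::string, int32_t> dict = {{"apple", 0}, {"dog", 1}};
    std::shared_ptr<Vocab> vocab;
    Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
    // apple -> 0, dog -> 1; any negative id makes the call fail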
| Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens, | |||||
| bool prepend_special, std::shared_ptr<Vocab> *vocab) { | |||||
| // Validate parameters | |||||
| std::string duplicate_word; | |||||
| for (const WordType &word : words) { | |||||
| if (std::count(words.begin(), words.end(), word) > 1) { | |||||
| if (duplicate_word.find(word) == std::string::npos) { | |||||
| duplicate_word = duplicate_word.empty() ? word : duplicate_word + ", " + word; | |||||
| } | |||||
| } | |||||
| } | |||||
| if (!duplicate_word.empty()) { | |||||
| MS_LOG(ERROR) << "words contains duplicate word: " << duplicate_word; | |||||
| RETURN_STATUS_UNEXPECTED("words contains duplicate word: " + duplicate_word); | |||||
| } | |||||
| std::string duplicate_sp; | |||||
| for (const WordType &sp : special_tokens) { | |||||
| if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) { | |||||
| if (duplicate_sp.find(sp) == std::string::npos) { | |||||
| duplicate_sp = duplicate_sp.empty() ? sp : duplicate_sp + ", " + sp; | |||||
| } | |||||
| } | |||||
| } | |||||
| if (!duplicate_sp.empty()) { | |||||
| MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp; | |||||
| RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp); | |||||
| } | |||||
| std::unordered_map<WordType, WordIdType> word2id; | |||||
| // if special tokens are prepended, normal word ids start after the special tokens | |||||
| WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0; | |||||
| for (auto word : words) { | |||||
| word2id[word] = word_id++; | |||||
| } | |||||
| word_id = prepend_special ? 0 : word2id.size(); | |||||
| for (auto special_token : special_tokens) { | |||||
| word2id[special_token] = word_id++; | |||||
| } | |||||
| *vocab = std::make_shared<Vocab>(std::move(word2id)); | |||||
| return Status::OK(); | |||||
| } | |||||
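To make the id assignment above concrete, a hedged sketch (word lists invented):

    std::shared_ptr<Vocab> vocab;
    // prepend_special = true:  <pad> -> 0, <unk> -> 1, cat -> 2, dog -> 3
    Vocab::BuildFromVector({"cat", "dog"}, {"<pad>", "<unk>"}, true, &vocab);
    // prepend_special = false: cat -> 0, dog -> 1, <pad> -> 2, <unk> -> 3
    Vocab::BuildFromVector({"cat", "dog"}, {"<pad>", "<unk>"}, false, &vocab);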
| Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, | |||||
| const std::vector<WordType> &special_tokens, bool prepend_special, | |||||
| std::shared_ptr<Vocab> *vocab) { | |||||
| // Validate parameters | |||||
| if (vocab_size < 0 && vocab_size != -1) { | |||||
| MS_LOG(ERROR) << "vocab_size shoule be either -1 or positive integer, but got " << vocab_size; | |||||
| RETURN_STATUS_UNEXPECTED("vocab_size shoule be either -1 or positive integer, but got " + | |||||
| std::to_string(vocab_size)); | |||||
| } | |||||
| std::string duplicate_sp; | |||||
| for (const WordType &sp : special_tokens) { | |||||
| if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) { | |||||
| if (duplicate_sp.find(sp) == std::string::npos) { | |||||
| duplicate_sp = duplicate_sp.empty() ? sp : duplicate_sp + ", " + sp; | |||||
| } | |||||
| } | |||||
| } | |||||
| if (!duplicate_sp.empty()) { | |||||
| MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp; | |||||
| RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp); | |||||
| } | |||||
| std::unordered_set<std::string> specials; | |||||
| // used to check that the words in the file do not collide with any special token | |||||
| for (auto word : special_tokens) { | |||||
| specials.insert(word); | |||||
| } | |||||
| WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0; | |||||
| std::unordered_map<WordType, WordIdType> word2id; | |||||
| std::fstream handle(path, std::ios::in); | |||||
| if (!handle.good() || !handle.is_open()) { | |||||
| MS_LOG(ERROR) << "fail to open:" + path; | |||||
| RETURN_STATUS_UNEXPECTED("fail to open:" + path); | |||||
| } | |||||
| std::string word; | |||||
| while (std::getline(handle, word)) { | |||||
| if (!delimiter.empty()) { | |||||
| // if the delimiter is not found, find_first_of returns std::string::npos, so substr keeps the whole line | |||||
| word = word.substr(0, word.find_first_of(delimiter)); | |||||
| } | |||||
| if (word2id.find(word) != word2id.end()) { | |||||
| MS_LOG(ERROR) << "duplicate word:" + word + "."; | |||||
| RETURN_STATUS_UNEXPECTED("duplicate word:" + word + "."); | |||||
| } | |||||
| if (specials.find(word) != specials.end()) { | |||||
| MS_LOG(ERROR) << word + " is already in special_tokens."; | |||||
| RETURN_STATUS_UNEXPECTED(word + " is already in special_tokens."); | |||||
| } | |||||
| word2id[word] = word_id++; | |||||
| // stop once vocab_size words have been read; this never triggers when vocab_size is -1 | |||||
| if (static_cast<int32_t>(word2id.size()) == vocab_size) break; | |||||
| } | |||||
| word_id = prepend_special ? 0 : word2id.size(); | |||||
| for (auto special_token : special_tokens) { | |||||
| word2id[special_token] = word_id++; | |||||
| } | |||||
| *vocab = std::make_shared<Vocab>(std::move(word2id)); | |||||
| return Status::OK(); | |||||
| } | |||||
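A hedged sketch of the file layout BuildFromFileCpp expects (path and contents invented); only the text before the first delimiter on each line is kept as the word:

    // vocab_list.txt (invented):
    //   home,1
    //   behind,2
    std::shared_ptr<Vocab> vocab;
    Status s = Vocab::BuildFromFileCpp("vocab_list.txt", ",", -1, {"<pad>"}, true, &vocab);
    // <pad> -> 0, home -> 1, behind -> 2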
| Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, | Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, | ||||
| const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) { | const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) { | ||||
| // python validator checks special_tokens doesn't contain any duplicate words | // python validator checks special_tokens doesn't contain any duplicate words | ||||
| @@ -86,21 +229,6 @@ Status Vocab::BuildFromFile(const std::string &path, const std::string &delimite | |||||
| return Status::OK(); | return Status::OK(); | ||||
| } | } | ||||
| Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) { | |||||
| std::unordered_map<WordType, WordIdType> word2id; | |||||
| for (auto p : words) { | |||||
| word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second); | |||||
| } | |||||
| *vocab = std::make_shared<Vocab>(std::move(word2id)); | |||||
| return Status::OK(); | |||||
| } | |||||
| void Vocab::append_word(const std::string &word) { | |||||
| if (word2id_.find(word) == word2id_.end()) { | |||||
| word2id_[word] = word2id_.size(); | |||||
| } | |||||
| } | |||||
| const WordIdType Vocab::kNoTokenExists = -1; | const WordIdType Vocab::kNoTokenExists = -1; | ||||
| } // namespace dataset | } // namespace dataset | ||||
| @@ -57,6 +57,34 @@ class Vocab { | |||||
| static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, | static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, | ||||
| const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab); | const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab); | ||||
| /// \brief Build a vocab from a c++ map. Word ids must be non-negative | |||||
| /// \param[in] words An unordered_map containing word, word id pair. | |||||
| /// \param[out] vocab A vocab object | |||||
| /// \return Error code | |||||
| static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words, | |||||
| std::shared_ptr<Vocab> *vocab); | |||||
| /// \brief Build a vocab from a c++ vector. Ids are assigned automatically in order; words must not contain duplicates | |||||
| /// \param[in] words A vector of unique strings used to build the vocab | |||||
| /// \param[in] special_tokens A vector of strings containing special tokens | |||||
| /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab | |||||
| /// \param[out] vocab A vocab object | |||||
| /// \return Error code | |||||
| static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens, | |||||
| bool prepend_special, std::shared_ptr<Vocab> *vocab); | |||||
| /// \brief Build a vocab from reading a vocab file; ids are assigned automatically in reading order | |||||
| /// \param[in] path Path to the vocab file; each line is assumed to contain one word | |||||
| /// \param[in] delimiter Delimiter to break each line with | |||||
| /// \param[in] vocab_size Number of words to read from the file; -1 means read all words | |||||
| /// \param[in] special_tokens A vector of strings containing special tokens | |||||
| /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab | |||||
| /// \param[out] vocab A vocab object | |||||
| /// \return Error code | |||||
| static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, | |||||
| const std::vector<WordType> &special_tokens, bool prepend_special, | |||||
| std::shared_ptr<Vocab> *vocab); | |||||
| // Lookup the id of a word, if word doesn't exist in vocab, return default_id | // Lookup the id of a word, if word doesn't exist in vocab, return default_id | ||||
| // @param const WordType word - word to look up | // @param const WordType word - word to look up | ||||
| // @param WordIdType default_id - word id to return to user when its not in the vocab | // @param WordIdType default_id - word id to return to user when its not in the vocab | ||||
| @@ -97,6 +97,7 @@ SET(DE_UT_SRCS | |||||
| concatenate_op_test.cc | concatenate_op_test.cc | ||||
| cyclic_array_test.cc | cyclic_array_test.cc | ||||
| perf_data_test.cc | perf_data_test.cc | ||||
| build_vocab_test.cc | |||||
| c_api_samplers_test.cc | c_api_samplers_test.cc | ||||
| c_api_transforms_test.cc | c_api_transforms_test.cc | ||||
| c_api_dataset_ops_test.cc | c_api_dataset_ops_test.cc | ||||
| @@ -104,12 +105,13 @@ SET(DE_UT_SRCS | |||||
| c_api_dataset_clue_test.cc | c_api_dataset_clue_test.cc | ||||
| c_api_dataset_coco_test.cc | c_api_dataset_coco_test.cc | ||||
| c_api_dataset_csv_test.cc | c_api_dataset_csv_test.cc | ||||
| c_api_dataset_filetext_test.cc | |||||
| c_api_dataset_textfile_test.cc | |||||
| c_api_dataset_manifest_test.cc | c_api_dataset_manifest_test.cc | ||||
| c_api_dataset_randomdata_test.cc | c_api_dataset_randomdata_test.cc | ||||
| c_api_dataset_voc_test.cc | c_api_dataset_voc_test.cc | ||||
| c_api_datasets_test.cc | c_api_datasets_test.cc | ||||
| c_api_dataset_iterator_test.cc | c_api_dataset_iterator_test.cc | ||||
| c_api_dataset_vocab.cc | |||||
| tensor_op_fusion_pass_test.cc | tensor_op_fusion_pass_test.cc | ||||
| sliding_window_op_test.cc | sliding_window_op_test.cc | ||||
| epoch_ctrl_op_test.cc | epoch_ctrl_op_test.cc | ||||
| @@ -0,0 +1,229 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <fstream> | |||||
| #include <iostream> | |||||
| #include <memory> | |||||
| #include <vector> | |||||
| #include <string> | |||||
| #include "common/common.h" | |||||
| #include "minddata/dataset/include/datasets.h" | |||||
| #include "minddata/dataset/include/status.h" | |||||
| using mindspore::dataset::Tensor; | |||||
| using mindspore::dataset::Status; | |||||
| using mindspore::dataset::Vocab; | |||||
| class MindDataTestVocab : public UT::DatasetOpTesting { | |||||
| protected: | |||||
| }; | |||||
| TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromUnorderedMap."; | |||||
| // Build a map | |||||
| std::unordered_map<std::string, int32_t> dict; | |||||
| dict["banana"] = 0; | |||||
| dict["apple"] = 1; | |||||
| dict["cat"] = 2; | |||||
| dict["dog"] = 3; | |||||
| // Build vocab from map | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Look up specified words | |||||
| std::vector<std::string> words = {"apple", "dog", "egg"}; | |||||
| std::vector<int32_t> expected = {1, 3, -1}; | |||||
| for (uint32_t i = 0; i < words.size(); ++i) { | |||||
| int32_t x = vocab->Lookup(words[i]); | |||||
| EXPECT_EQ(x, expected[i]); | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyMap."; | |||||
| // Build vocab from empty map | |||||
| std::unordered_map<std::string, int32_t> dict; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Look up specified words | |||||
| // Expect that we will return -1 when word is not in vocab | |||||
| std::vector<std::string> words = {"apple", "dog", "egg"}; | |||||
| std::vector<int32_t> expected = {-1, -1, -1}; | |||||
| for (uint32_t i = 0; i < words.size(); ++i) { | |||||
| int32_t x = vocab->Lookup(words[i]); | |||||
| EXPECT_EQ(x, expected[i]); | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromMapFail) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromMapFail."; | |||||
| // Build a map | |||||
| std::unordered_map<std::string, int32_t> dict; | |||||
| dict["banana"] = 0; | |||||
| dict["apple"] = -1; | |||||
| // Expected failure: index of word can not be negative | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); | |||||
| EXPECT_NE(s, Status::OK()); | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorPrependSpTokens."; | |||||
| // Build vocab from a vector of words, special tokens are prepended to vocab | |||||
| std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Look up specified words | |||||
| // Expect that we will return -1 when word is not in vocab | |||||
| std::vector<std::string> words = {"apple", "banana", "fox"}; | |||||
| std::vector<int32_t> expected = {1, 2, -1}; | |||||
| for (uint32_t i = 0; i < words.size(); ++i) { | |||||
| int32_t x = vocab->Lookup(words[i]); | |||||
| EXPECT_EQ(x, expected[i]); | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorAppendSpTokens."; | |||||
| // Build vocab from a vector of words, special tokens are appended to vocab | |||||
| std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(list, {"<unk>"}, false, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Look up specified words | |||||
| std::vector<std::string> words = {"apple", "<unk>", "fox"}; | |||||
| std::vector<int32_t> expected = {0, 5, -1}; | |||||
| for (uint32_t i = 0; i < words.size(); ++i) { | |||||
| int32_t x = vocab->Lookup(words[i]); | |||||
| EXPECT_EQ(x, expected[i]); | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorWithNoSpTokens."; | |||||
| // Build vocab from a vector of words with no special tokens | |||||
| std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"}; | |||||
| std::vector<std::string> sp_tokens = {}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Look up specified words | |||||
| std::vector<std::string> words = {"apple", "banana", "fox", "<pad>"}; | |||||
| std::vector<int32_t> expected = {0, 1, -1, -1}; | |||||
| for (uint32_t i = 0; i < words.size(); ++i) { | |||||
| int32_t x = vocab->Lookup(words[i]); | |||||
| EXPECT_EQ(x, expected[i]); | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyVector."; | |||||
| // Build vocab from empty vector | |||||
| std::vector<std::string> list = {}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(list, {}, false, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Look up specified words | |||||
| // Expect that we will return -1 when word is not in vocab | |||||
| std::vector<std::string> words = {"apple", "banana", "fox"}; | |||||
| std::vector<int32_t> expected = {-1, -1, -1}; | |||||
| for (uint32_t i = 0; i < words.size(); ++i) { | |||||
| int32_t x = vocab->Lookup(words[i]); | |||||
| EXPECT_EQ(x, expected[i]); | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromVectorFail1) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail1."; | |||||
| // Build vocab from a vector of words with no special tokens | |||||
| std::vector<std::string> list = {"apple", "apple", "cat", "cat", "egg"}; | |||||
| std::vector<std::string> sp_tokens = {}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| // Expected failure: duplicate words "apple" and "cat" | |||||
| Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); | |||||
| EXPECT_NE(s, Status::OK()); | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail2."; | |||||
| // Build vocab from a vector of words with no special tokens | |||||
| std::vector<std::string> list = {"apple", "dog", "egg"}; | |||||
| std::vector<std::string> sp_tokens = {"<pad>", "<unk>", "<pad>", "<unk>", "<none>"}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| // Expected failure: duplicate special token <pad> <unk> | |||||
| Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); | |||||
| EXPECT_NE(s, Status::OK()); | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromFile) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFile."; | |||||
| // Build vocab from local file | |||||
| std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Look up specified words | |||||
| std::vector<std::string> words = {"not", "all"}; | |||||
| std::vector<int32_t> expected = {2, 3}; | |||||
| for (uint32_t i = 0; i < words.size(); ++i) { | |||||
| int32_t x = vocab->Lookup(words[i]); | |||||
| EXPECT_EQ(x, expected[i]); | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromFileFail1) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail1."; | |||||
| // Build vocab from a local file that does not exist | |||||
| std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt"; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab); | |||||
| EXPECT_NE(s, Status::OK()); | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromFileFail2) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2."; | |||||
| // Build vocab from local file | |||||
| std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| // Expected failure: vocab_size should be either -1 or a positive integer | |||||
| Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); | |||||
| EXPECT_NE(s, Status::OK()); | |||||
| } | |||||
| TEST_F(MindDataTestVocab, TestVocabFromFileFail3) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2."; | |||||
| // Build vocab from local file which is not exist | |||||
| std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| // Expected failure: duplicate special token <unk> | |||||
| Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab); | |||||
| EXPECT_NE(s, Status::OK()); | |||||
| } | |||||
| @@ -14,7 +14,6 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include "common/common.h" | #include "common/common.h" | ||||
| #include "minddata/dataset/engine/datasetops/source/voc_op.h" | |||||
| #include "minddata/dataset/include/datasets.h" | #include "minddata/dataset/include/datasets.h" | ||||
| using namespace mindspore::dataset::api; | using namespace mindspore::dataset::api; | ||||
| @@ -0,0 +1,254 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <fstream> | |||||
| #include <iostream> | |||||
| #include <memory> | |||||
| #include <vector> | |||||
| #include <string> | |||||
| #include "common/common.h" | |||||
| #include "minddata/dataset/include/datasets.h" | |||||
| #include "minddata/dataset/include/status.h" | |||||
| #include "minddata/dataset/include/transforms.h" | |||||
| #include "minddata/dataset/include/text.h" | |||||
| using namespace mindspore::dataset::api; | |||||
| using mindspore::dataset::ShuffleMode; | |||||
| using mindspore::dataset::Tensor; | |||||
| using mindspore::dataset::Status; | |||||
| using mindspore::dataset::Vocab; | |||||
| class MindDataTestPipeline : public UT::DatasetOpTesting { | |||||
| protected: | |||||
| }; | |||||
| TEST_F(MindDataTestPipeline, TestVocabLookupOp) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(list, {"<pad>", "<unk>"}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create Lookup operation on ds | |||||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>"); | |||||
| EXPECT_NE(lookup, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({lookup}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| uint64_t i = 0; | |||||
| std::vector<int32_t> expected = {2, 1, 4, 5, 6, 7}; | |||||
| while (row.size() != 0) { | |||||
| auto ind = row["text"]; | |||||
| MS_LOG(INFO) << ind->shape() << " " << *ind; | |||||
| std::shared_ptr<Tensor> expected_item; | |||||
| Tensor::CreateScalar(expected[i], &expected_item); | |||||
| EXPECT_EQ(*ind, *expected_item); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1."; | |||||
| // Create a TextFile Dataset | |||||
| std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Build vocab from vector | |||||
| std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"}; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(list, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create lookup op for ds | |||||
| // Expected failure: "<unk>" is not a word of vocab | |||||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>"); | |||||
| EXPECT_EQ(lookup, nullptr); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail2."; | |||||
| // Vocab has nothing | |||||
| std::shared_ptr<Vocab> vocab; | |||||
| // Create lookup op | |||||
| // Expected failure: vocab is null | |||||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, ""); | |||||
| EXPECT_EQ(lookup, nullptr); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpWithEmptyUnknownToken."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from map | |||||
| std::unordered_map<std::string, int32_t> dict; | |||||
| dict["Home"] = 3; | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create Lookup operation on ds | |||||
| // Expected failure: "" is not a word of vocab | |||||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, ""); | |||||
| EXPECT_EQ(lookup, nullptr); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestVocabFromDataset) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create vocab from dataset | |||||
| std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()}, | |||||
| std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true); | |||||
| EXPECT_NE(vocab, nullptr); | |||||
| // Check if vocab has words or not | |||||
| int32_t home_index = vocab->Lookup("home"); | |||||
| EXPECT_EQ(home_index, 4); | |||||
| // Create Lookup operation on ds | |||||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>"); | |||||
| EXPECT_NE(lookup, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({lookup}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| uint64_t i = 0; | |||||
| std::vector<int32_t> expected = {4, 5, 3, 6, 7, 2}; | |||||
| while (row.size() != 0) { | |||||
| auto ind = row["text"]; | |||||
| MS_LOG(INFO) << ind->shape() << " " << *ind; | |||||
| std::shared_ptr<Tensor> expected_item; | |||||
| Tensor::CreateScalar(expected[i], &expected_item); | |||||
| EXPECT_EQ(*ind, *expected_item); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetDefault."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create vocab from dataset | |||||
| std::shared_ptr<Vocab> vocab = ds->BuildVocab(); | |||||
| EXPECT_NE(vocab, nullptr); | |||||
| // Check if vocab has words or not | |||||
| int32_t home_index = vocab->Lookup("home"); | |||||
| EXPECT_EQ(home_index, 2); | |||||
| // Create Lookup operation on ds | |||||
| std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home"); | |||||
| EXPECT_NE(lookup, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({lookup}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| uint64_t i = 0; | |||||
| std::vector<int32_t> expected = {2, 3, 1, 4, 5, 0}; | |||||
| while (row.size() != 0) { | |||||
| auto ind = row["text"]; | |||||
| MS_LOG(INFO) << ind->shape() << " " << *ind; | |||||
| std::shared_ptr<Tensor> expected_item; | |||||
| Tensor::CreateScalar(expected[i], &expected_item); | |||||
| EXPECT_EQ(*ind, *expected_item); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail1."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create vocab from dataset | |||||
| // Expected failure: top_k can not be negative | |||||
| std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()}, | |||||
| -2, {"<pad>", "<unk>"}, true); | |||||
| EXPECT_EQ(vocab, nullptr); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail2) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail2."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create vocab from dataset | |||||
| // Expected failure: frequency_range [a,b] should be 0 <= a <= b | |||||
| std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {4, 1}, | |||||
| std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true); | |||||
| EXPECT_EQ(vocab, nullptr); | |||||
| } | |||||