Merge pull request !21823 from 张渝/CharNGramtags/v1.6.0
| @@ -19,6 +19,7 @@ | |||
| #include "minddata/dataset/api/python/pybind_register.h" | |||
| #include "minddata/dataset/include/dataset/constants.h" | |||
| #include "minddata/dataset/text/char_n_gram.h" | |||
| #include "minddata/dataset/text/fast_text.h" | |||
| #include "minddata/dataset/text/glove.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| @@ -90,6 +91,16 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) { | |||
| .export_values(); | |||
| })); | |||
// Bind CharNGram to Python. Exposes the default constructor plus the static
// factory `from_file(path, max_vectors)`, which loads a pre-trained vector
// file via CharNGram::BuildFromFile and converts a failing Status into a
// Python exception (THROW_IF_ERROR).
PYBIND_REGISTER(CharNGram, 1, ([](const py::module *m) {
                  (void)py::class_<CharNGram, Vectors, std::shared_ptr<CharNGram>>(*m, "CharNGram")
                    .def(py::init<>())
                    .def_static("from_file", [](const std::string &path, int32_t max_vectors) {
                      std::shared_ptr<CharNGram> char_n_gram;
                      THROW_IF_ERROR(CharNGram::BuildFromFile(&char_n_gram, path, max_vectors));
                      return char_n_gram;
                    });
                }));
| PYBIND_REGISTER(FastText, 1, ([](const py::module *m) { | |||
| (void)py::class_<FastText, Vectors, std::shared_ptr<FastText>>(*m, "FastText") | |||
| .def(py::init<>()) | |||
| @@ -4,6 +4,7 @@ add_subdirectory(kernels) | |||
| file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| add_library(text OBJECT | |||
| char_n_gram.cc | |||
| fast_text.cc | |||
| glove.cc | |||
| sentence_piece_vocab.cc | |||
| @@ -0,0 +1,98 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/text/char_n_gram.h" | |||
| #include "utils/file_utils.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| CharNGram::CharNGram(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) : Vectors(map, dim) {} | |||
| Status CharNGram::BuildFromFile(std::shared_ptr<CharNGram> *char_n_gram, const std::string &path, int32_t max_vectors) { | |||
| RETURN_UNEXPECTED_IF_NULL(char_n_gram); | |||
| std::unordered_map<std::string, std::vector<float>> map; | |||
| int vector_dim = -1; | |||
| RETURN_IF_NOT_OK(CharNGram::Load(path, max_vectors, &map, &vector_dim)); | |||
| *char_n_gram = std::make_shared<CharNGram>(std::move(map), vector_dim); | |||
| return Status::OK(); | |||
| } | |||
| std::vector<float> CharNGram::Lookup(const std::string &token, const std::vector<float> &unk_init, | |||
| bool lower_case_backup) { | |||
| std::vector<float> init_vec(dim_, 0); | |||
| if (!unk_init.empty()) { | |||
| if (unk_init.size() != dim_) { | |||
| MS_LOG(WARNING) << "CharNGram: size of unk_init is not the same as vectors, will initialize with zero vectors."; | |||
| } else { | |||
| init_vec = unk_init; | |||
| } | |||
| } | |||
| std::string lower_token = token; | |||
| if (lower_case_backup) { | |||
| std::transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower); | |||
| } | |||
| std::vector<std::string> chars; | |||
| chars.push_back("#BEGIN#"); | |||
| for (int i = 0; i < lower_token.length(); i++) { | |||
| std::string s; | |||
| s.push_back(lower_token[i]); // Convert a char type letter to a string type. | |||
| chars.push_back(s); | |||
| } | |||
| chars.push_back("#END#"); | |||
| int len = chars.size(); | |||
| int num_vectors = 0; | |||
| std::vector<float> vector_value_sum(dim_, 0); | |||
| std::vector<float> vector_value_temp; | |||
| // The length of meaningful characters in the pre-training file is 2, 3, 4. | |||
| const int slice_len[3] = {2, 3, 4}; | |||
| const int slice_len_size = sizeof(slice_len) / sizeof(slice_len[0]); | |||
| for (int i = 0; i < slice_len_size; i++) { | |||
| int end = len - slice_len[i] + 1; | |||
| for (int pos = 0; pos < end; pos++) { | |||
| std::vector<std::string> gram_vec; | |||
| std::vector<std::string>::const_iterator first = chars.begin() + pos; | |||
| std::vector<std::string>::const_iterator second = first + slice_len[i]; | |||
| gram_vec.assign(first, second); | |||
| std::string c = ""; | |||
| std::string gram = std::accumulate(gram_vec.begin(), gram_vec.end(), c); | |||
| std::string gram_key = std::to_string(slice_len[i]) + "gram-" + gram; | |||
| auto str_index = map_.find(gram_key); | |||
| if (str_index == map_.end()) { | |||
| vector_value_temp = init_vec; | |||
| } else { | |||
| vector_value_temp = str_index->second; | |||
| } | |||
| if (vector_value_temp != init_vec) { | |||
| std::transform(vector_value_temp.begin(), vector_value_temp.end(), vector_value_sum.begin(), | |||
| vector_value_sum.begin(), std::plus<float>()); | |||
| num_vectors++; | |||
| } | |||
| } | |||
| } | |||
| std::vector<float> vector_value(dim_, 0); | |||
| if (num_vectors > 0) { | |||
| std::transform(vector_value_sum.begin(), vector_value_sum.end(), vector_value.begin(), | |||
| [&num_vectors](float value) -> float { return value / num_vectors; }); | |||
| return vector_value; | |||
| } else { | |||
| return init_vec; | |||
| } | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,64 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_CHAR_N_GRAM_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_CHAR_N_GRAM_H_

#include <algorithm>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "minddata/dataset/text/vectors.h"

namespace mindspore {
namespace dataset {
/// \brief Pre-trained CharNGram word vectors: tokens are embedded as the
/// average of the vectors of their character n-grams (n = 2, 3, 4).
class CharNGram : public Vectors {
 public:
  /// \brief Default constructor (empty vector set).
  CharNGram() = default;

  /// Constructor.
  /// \param[in] map A map between string and vector.
  /// \param[in] dim Dimension of the vectors.
  CharNGram(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim);

  /// \brief Destructor.
  ~CharNGram() = default;

  /// \brief Build CharNGram from reading a CharNGram pre-train vector file.
  /// \param[out] char_n_gram CharNGram object which contains the pre-train vectors.
  /// \param[in] path Path to the CharNGram pre-trained word vector file.
  /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
  static Status BuildFromFile(std::shared_ptr<CharNGram> *char_n_gram, const std::string &path,
                              int32_t max_vectors = 0);

  /// \brief Look up embedding vectors of token.
  /// \param[in] token A token to be looked up.
  /// \param[in] unk_init In case of the token is out-of-vectors (OOV), the result will be initialized with `unk_init`.
  ///     (default={}, means to initialize with zero vectors).
  /// \param[in] lower_case_backup Whether to look up the token in the lower case (Default = false).
  /// \return The vector of the input token.
  // NOTE(review): this has the same signature as Vectors::Lookup — confirm the
  // base declaration is virtual so calls through a Vectors pointer dispatch
  // here rather than silently hiding it.
  std::vector<float> Lookup(const std::string &token, const std::vector<float> &unk_init = {},
                            bool lower_case_backup = false);
};
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_CHAR_N_GRAM_H_
| @@ -20,7 +20,7 @@ | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {} | |||
| FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) : Vectors(map, dim) {} | |||
| Status CheckFastText(const std::string &file_path) { | |||
| Path path = Path(file_path); | |||
| @@ -39,7 +39,7 @@ class FastText : public Vectors { | |||
| /// Constructor. | |||
| /// \param[in] map A map between string and vector. | |||
| /// \param[in] dim Dimension of the vectors. | |||
| FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim); | |||
| FastText(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim); | |||
| /// Destructor. | |||
| ~FastText() = default; | |||
| @@ -20,7 +20,7 @@ | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| GloVe::GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {} | |||
| GloVe::GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) : Vectors(map, dim) {} | |||
| Status CheckGloVe(const std::string &file_path) { | |||
| Path path = Path(file_path); | |||
| @@ -39,7 +39,7 @@ class GloVe : public Vectors { | |||
| /// Constructor. | |||
| /// \param[in] map A map between string and vector. | |||
| /// \param[in] dim Dimension of the vectors. | |||
| GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int dim); | |||
| GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim); | |||
| /// Destructor. | |||
| ~GloVe() = default; | |||
| @@ -60,7 +60,7 @@ Status Vectors::InferShape(const std::string &path, int32_t max_vectors, int32_t | |||
| } | |||
| Status Vectors::Load(const std::string &path, int32_t max_vectors, | |||
| std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim) { | |||
| std::unordered_map<std::string, std::vector<float>> *map, int32_t *vector_dim) { | |||
| RETURN_UNEXPECTED_IF_NULL(map); | |||
| RETURN_UNEXPECTED_IF_NULL(vector_dim); | |||
| auto realpath = FileUtils::GetRealPath(common::SafeCStr(path)); | |||
| @@ -107,7 +107,7 @@ Status Vectors::Load(const std::string &path, int32_t max_vectors, | |||
| return Status::OK(); | |||
| } | |||
| Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim) { | |||
| Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) { | |||
| map_ = std::move(map); | |||
| dim_ = dim; | |||
| } | |||
| @@ -40,7 +40,7 @@ class Vectors { | |||
| /// Constructor. | |||
| /// \param[in] map A map between string and vector. | |||
| /// \param[in] dim Dimension of the vectors. | |||
| Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim); | |||
| Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim); | |||
| /// Destructor. | |||
| virtual ~Vectors() = default; | |||
| @@ -61,7 +61,7 @@ class Vectors { | |||
| bool lower_case_backup = false); | |||
| /// \brief Getter of dimension. | |||
| const int &Dim() const { return dim_; } | |||
| const int32_t &Dim() const { return dim_; } | |||
| protected: | |||
| /// \brief Infer the shape of the pre-trained word vector file. | |||
| @@ -79,9 +79,9 @@ class Vectors { | |||
| /// \param[out] map The map between words and vectors. | |||
| /// \param[out] vector_dim The dimension of the vectors in the file. | |||
| static Status Load(const std::string &path, int32_t max_vectors, | |||
| std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim); | |||
| std::unordered_map<std::string, std::vector<float>> *map, int32_t *vector_dim); | |||
| int dim_; | |||
| int32_t dim_; | |||
| std::unordered_map<std::string, std::vector<float>> map_; | |||
| }; | |||
| } // namespace dataset | |||
| @@ -28,14 +28,14 @@ import platform | |||
| from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \ | |||
| TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors | |||
| from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \ | |||
| SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText, GloVe | |||
| SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText, GloVe, CharNGram | |||
| __all__ = [ | |||
| "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram", | |||
| "to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber", | |||
| "PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType", | |||
| "SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors", "FastText", | |||
| "GloVe" | |||
| "GloVe", "CharNGram" | |||
| ] | |||
| if platform.system().lower() != 'windows': | |||
| @@ -27,7 +27,7 @@ from .validators import check_from_file, check_from_list, check_from_dict, check | |||
| check_from_file_vectors | |||
| __all__ = [ | |||
| "Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText", "GloVe" | |||
| "Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText", "GloVe", "CharNGram" | |||
| ] | |||
| @@ -465,3 +465,29 @@ class GloVe(cde.GloVe): | |||
| max_vectors = max_vectors if max_vectors is not None else 0 | |||
| return super().from_file(file_path, max_vectors) | |||
class CharNGram(cde.CharNGram):
    """
    CharNGram object that is used to map tokens into pre-trained vectors.
    """

    @classmethod
    @check_from_file_vectors
    def from_file(cls, file_path, max_vectors=None):
        """
        Build a CharNGram vector from a file.

        Args:
            file_path (str): Path of the file that contains the CharNGram vectors.
            max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
                Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
                situations where the entire set doesn't fit in memory, or is not needed for another reason,
                passing max_vectors can limit the size of the loaded set (default=None, no limit).

        Returns:
            CharNGram, the pre-trained vector set built from the file.

        Examples:
            >>> char_n_gram = text.CharNGram.from_file("/path/to/char_n_gram/file", max_vectors=None)
        """
        # The C++ layer uses 0 to mean "no limit"; map the Python-side None to it.
        max_vectors = max_vectors if max_vectors is not None else 0
        return super().from_file(file_path, max_vectors)
| @@ -53,6 +53,7 @@ SET(DE_UT_SRCS | |||
| c_api_repeat_test.cc | |||
| c_api_samplers_test.cc | |||
| c_api_text_sentence_piece_vocab_test.cc | |||
| c_api_text_test.cc | |||
| c_api_text_vocab_test.cc | |||
| c_api_transforms_test.cc | |||
| @@ -23,6 +23,7 @@ | |||
| #include "minddata/dataset/include/dataset/datasets.h" | |||
| #include "minddata/dataset/include/dataset/text.h" | |||
| #include "minddata/dataset/include/dataset/transforms.h" | |||
| #include "minddata/dataset/text/char_n_gram.h" | |||
| #include "minddata/dataset/text/fast_text.h" | |||
| #include "minddata/dataset/text/glove.h" | |||
| #include "minddata/dataset/text/vectors.h" | |||
| @@ -30,6 +31,7 @@ | |||
| using namespace mindspore::dataset; | |||
| using mindspore::Status; | |||
| using mindspore::dataset::CharNGram; | |||
| using mindspore::dataset::FastText; | |||
| using mindspore::dataset::GloVe; | |||
| using mindspore::dataset::ShuffleMode; | |||
| @@ -1658,7 +1660,8 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) { | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create ToNumber operation on ds | |||
| std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8); | |||
| std::shared_ptr<TensorTransform> to_number = | |||
| std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8); | |||
| EXPECT_NE(to_number, nullptr); | |||
| // Create a Map operation on ds | |||
| @@ -3740,7 +3743,7 @@ TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) { | |||
| /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestVectorsAllParams) { | |||
| // Test with all parameters. | |||
| // Test with all parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; | |||
| @@ -3801,7 +3804,7 @@ TEST_F(MindDataTestPipeline, TestVectorsAllParams) { | |||
| /// Description: test with pre-vectors set that have the different dimension | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) { | |||
| // Tokens don't have the same number of vectors. | |||
| // Tokens don't have the same number of vectors. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension."; | |||
| // Create a TextFile dataset | |||
| @@ -3819,7 +3822,7 @@ TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) { | |||
| /// Description: test with pre-vectors set that has the head-info | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) { | |||
| // Test with words that has head info. | |||
| // Test with words that has head info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; | |||
| @@ -3880,7 +3883,7 @@ TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) { | |||
| /// Description: test with the parameter max_vectors that is <= 0 | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) { | |||
| // Test with max_vectors <= 0. | |||
| // Test with max_vectors <= 0. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero."; | |||
| // Create a TextFile dataset | |||
| @@ -3898,7 +3901,7 @@ TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) { | |||
| /// Description: test with the pre-vectors file that is empty | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) { | |||
| // Read empty file. | |||
| // Read empty file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile."; | |||
| // Create a TextFile dataset | |||
| @@ -3916,7 +3919,7 @@ TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) { | |||
| /// Description: test with the pre-vectors file that is not exist | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) { | |||
| // Test with not exist file. | |||
| // Test with not exist file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile."; | |||
| // Create a TextFile dataset | |||
| @@ -3934,7 +3937,7 @@ TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) { | |||
| /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) { | |||
| // wrong info. | |||
| // Wrong info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile."; | |||
| // Create a TextFile dataset | |||
| @@ -4137,7 +4140,7 @@ TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) { | |||
| /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestFastTextAllParams) { | |||
| // Test with all parameters. | |||
| // Test with all parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| @@ -4198,7 +4201,7 @@ TEST_F(MindDataTestPipeline, TestFastTextAllParams) { | |||
| /// Description: test with pre-vectors set that have the different dimension | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) { | |||
| // Tokens don't have the same number of vectors. | |||
| // Tokens don't have the same number of vectors. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension."; | |||
| // Create a TextFile dataset | |||
| @@ -4216,7 +4219,7 @@ TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) { | |||
| /// Description: test with the parameter max_vectors that is <= 0 | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) { | |||
| // Test with max_vectors <= 0. | |||
| // Test with max_vectors <= 0. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero."; | |||
| // Create a TextFile dataset | |||
| @@ -4234,7 +4237,7 @@ TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) { | |||
| /// Description: test with the pre-vectors file that is empty | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) { | |||
| // Read empty file. | |||
| // Read empty file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile."; | |||
| // Create a TextFile dataset | |||
| @@ -4252,7 +4255,7 @@ TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) { | |||
| /// Description: test with the pre-vectors file that is not exist | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) { | |||
| // Test with not exist file. | |||
| // Test with not exist file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile."; | |||
| // Create a TextFile dataset | |||
| @@ -4270,7 +4273,7 @@ TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) { | |||
| /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) { | |||
| // wrong info. | |||
| // Wrong info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile."; | |||
| // Create a TextFile dataset | |||
| @@ -4288,7 +4291,7 @@ TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) { | |||
| /// Description: test with the pre-vectors set that has a wrong suffix | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) { | |||
| // wrong info. | |||
| // Wrong info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix."; | |||
| // Create a TextFile dataset | |||
| @@ -4497,7 +4500,7 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) { | |||
| /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestGloVeAllParams) { | |||
| // Test with all parameters. | |||
| // Test with all parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllParams."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testGloVe/words.txt"; | |||
| @@ -4560,7 +4563,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) { | |||
| /// Description: test with pre-vectors set that have the different dimension | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) { | |||
| // Tokens don't have the same number of glove. | |||
| // Tokens don't have the same number of glove. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDifferentDimension."; | |||
| // Create a TextFile dataset | |||
| @@ -4578,7 +4581,7 @@ TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) { | |||
| /// Description: test with the parameter max_vectors that is <= 0 | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) { | |||
| // Test with max_vectors <= 0. | |||
| // Test with max_vectors <= 0. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeMaxVectorsLessThanZero."; | |||
| // Create a TextFile dataset | |||
| @@ -4596,7 +4599,7 @@ TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) { | |||
| /// Description: test with the pre-vectors file that is empty | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) { | |||
| // Read empty file. | |||
| // Read empty file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithEmptyFile."; | |||
| // Create a TextFile dataset | |||
| @@ -4614,7 +4617,7 @@ TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) { | |||
| /// Description: test with the pre-vectors file that is not exist | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) { | |||
| // Test with not exist file. | |||
| // Test with not exist file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithNotExistFile."; | |||
| // Create a TextFile dataset | |||
| @@ -4632,7 +4635,7 @@ TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) { | |||
| /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) { | |||
| // wrong info. | |||
| // Wrong info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongInfoFile."; | |||
| // Create a TextFile dataset | |||
| @@ -4650,7 +4653,7 @@ TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) { | |||
| /// Description: test with the pre-vectors set that has a wrong format | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) { | |||
| // wrong info. | |||
| // Wrong info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongFormat."; | |||
| // Create a TextFile dataset | |||
| @@ -4663,3 +4666,324 @@ TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) { | |||
| Status s = GloVe::BuildFromFile(&glove, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test with default parameter in function BuildFromFile and function Lookup | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
  // Test with default parameter: BuildFromFile with only the mandatory path,
  // then run the vectors through a ToVectors map over a small TextFile dataset.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDefaultParam.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Load pre-trained vectors with the default max_vectors (0 = no limit).
  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // Expected per-token embeddings; zero rows correspond to OOV tokens.
  std::vector<std::vector<float>> expected = {{0,0,0,0,0},
                                              {0,0,0,0,0},
                                              {0.117336,0.362446,-0.983326,0.939264,-0.05648},
                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
                                              {0,0,0,0,0},
                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
                                              {0,0,0,0,0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // NOTE(review): only the shapes are compared below; the `expected` values
    // are never checked element-wise — confirm whether a value comparison
    // was intended here.
    std::vector<int64_t> ind_shape = ind.Shape();
    std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
    EXPECT_EQ(ind_shape, ms_expected_shape);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // words.txt contains 7 tokens, so 7 rows are expected.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
| /// Feature: CharNGram. | |||
| /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
  // Test with two parameters: same pipeline as the default-param test, but
  // BuildFromFile is limited to the first 18 pre-trained vectors.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllBuildfromfileParams.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Load only the first 18 vectors from the file (max_vectors = 18).
  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // Expected per-token embeddings with the truncated vector set; note the
  // third row differs from the default-param test because some grams fall
  // outside the first 18 vectors.
  std::vector<std::vector<float>> expected = {{0,0,0,0,0},
                                              {0,0,0,0,0},
                                              {-0.155665,0.664073,-0.538499,1.22657,-0.2162},
                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
                                              {0,0,0,0,0},
                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
                                              {0,0,0,0,0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // NOTE(review): only the shapes are compared below; the `expected` values
    // are never checked element-wise — confirm whether a value comparison
    // was intended here.
    std::vector<int64_t> ind_shape = ind.Shape();
    std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
    EXPECT_EQ(ind_shape, ms_expected_shape);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // words.txt contains 7 tokens, so 7 rows are expected.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
| /// Feature: CharNGram | |||
| /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) { | |||
| // Test with two parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramUnknownInit."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::vector<float> unknown_init(5, -1); | |||
| std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({lookup}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| uint64_t i = 0; | |||
| std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1}, | |||
| {-1,-1,-1,-1,-1}, | |||
| {-0.155665,0.664073,-0.538499,1.22657,-0.2162}, | |||
| {0.657201,2.11761,-1.59276,0.432072,1.21395}, | |||
| {-1,-1,-1,-1,-1}, | |||
| {-2.26956,0.288491,-0.740001,0.661703,0.147355}, | |||
| {-1,-1,-1,-1,-1}}; | |||
| while (row.size() != 0) { | |||
| auto ind = row["text"]; | |||
| MS_LOG(INFO) << ind.Shape(); | |||
| TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); | |||
| TensorPtr de_expected_item; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item)); | |||
| mindspore::MSTensor ms_expected_item = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item)); | |||
| std::vector<int64_t> ind_shape = ind.Shape(); | |||
| std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape(); | |||
| EXPECT_EQ(ind_shape, ms_expected_shape); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`, | |||
| /// `unknown_init` and `lower_case_backup` in function Lookup. Some tokens contain uppercase letters | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestCharNGramAllParams) { | |||
| // Test with all parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllParams."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words_with_big_letter.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::vector<float> unknown_init(5, -1); | |||
| std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({lookup}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| uint64_t i = 0; | |||
| std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1}, | |||
| {-1,-1,-1,-1,-1}, | |||
| {0.117336,0.362446,-0.983326,0.939264,-0.05648}, | |||
| {0.657201,2.11761,-1.59276,0.432072,1.21395}, | |||
| {-1,-1,-1,-1,-1}, | |||
| {-2.26956,0.288491,-0.740001,0.661703,0.147355}, | |||
| {-1,-1,-1,-1,-1}}; | |||
| while (row.size() != 0) { | |||
| auto ind = row["text"]; | |||
| MS_LOG(INFO) << ind.Shape(); | |||
| TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); | |||
| TensorPtr de_expected_item; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item)); | |||
| mindspore::MSTensor ms_expected_item = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item)); | |||
| std::vector<int64_t> ind_shape = ind.Shape(); | |||
| std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape(); | |||
| EXPECT_EQ(ind_shape, ms_expected_shape); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test with a pre-trained vectors file whose rows have different dimensions | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestCharNGramDifferentDimension) { | |||
| // Tokens don't have the same number of vectors. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDifferentDimension."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20_dim_different.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test with the parameter max_vectors that is <= 0 | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestCharNGramMaxVectorsLessThanZero) { | |||
| // Test with max_vectors <= 0. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramMaxVectorsLessThanZero."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, -1); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test with a pre-trained vectors file that is empty | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestCharNGramWithEmptyFile) { | |||
| // Read empty file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramWithEmptyFile."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test with a pre-trained vectors file that does not exist | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestCharNGramsWithNotExistFile) { | |||
| // Test with not exist file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramsWithNotExistFile."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| @@ -23,6 +23,7 @@ | |||
| #include "minddata/dataset/include/dataset/vision.h" | |||
| #include "minddata/dataset/include/dataset/audio.h" | |||
| #include "minddata/dataset/include/dataset/text.h" | |||
| #include "minddata/dataset/text/char_n_gram.h" | |||
| #include "minddata/dataset/text/fast_text.h" | |||
| #include "minddata/dataset/text/glove.h" | |||
| #include "minddata/dataset/text/vectors.h" | |||
| @@ -30,6 +31,7 @@ | |||
| using namespace mindspore::dataset; | |||
| using mindspore::LogStream; | |||
| using mindspore::dataset::CharNGram; | |||
| using mindspore::dataset::FastText; | |||
| using mindspore::dataset::GloVe; | |||
| using mindspore::dataset::Vectors; | |||
| @@ -1937,6 +1939,142 @@ TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForGloVe) { | |||
| EXPECT_FALSE(status02.IsOk()); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test basic usage of CharNGram and the ToVectors with default parameter | |||
| /// Expectation: get correct MSTensor | |||
| TEST_F(MindDataTestExecute, TestCharNGramParam) { | |||
| MS_LOG(INFO) << "Doing MindDataTestExecute-TestCharNGramParam."; | |||
| std::shared_ptr<Tensor> de_tensor; | |||
| Tensor::CreateScalar<std::string>("the", &de_tensor); | |||
| auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor)); | |||
| mindspore::MSTensor lookup_result; | |||
| // Create expected output. | |||
| std::shared_ptr<Tensor> de_expected01; | |||
| std::vector<float> expected01 = {-0.840079,-0.0270003,-0.833472,0.588367,-0.210012}; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected01, &de_expected01)); | |||
| auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01)); | |||
| std::shared_ptr<Tensor> de_expected02; | |||
| std::vector<float> expected02 = {-1.34122,0.0442693,-0.48697,0.662939,-0.367669}; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected02, &de_expected02)); | |||
| auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02)); | |||
| // Transform params. | |||
| std::string vectors_dir = "data/dataset/testVectors/char_n_gram_20.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram01; | |||
| Status s01 = CharNGram::BuildFromFile(&char_n_gram01, vectors_dir); | |||
| EXPECT_EQ(s01, Status::OK()); | |||
| std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(char_n_gram01); | |||
| auto transform01 = Execute({to_vectors01}); | |||
| Status status01 = transform01(token, &lookup_result); | |||
| EXPECT_EQ(lookup_result.Shape(), ms_expected01.Shape()); | |||
| EXPECT_TRUE(status01.IsOk()); | |||
| std::shared_ptr<CharNGram> char_n_gram02; | |||
| Status s02 = CharNGram::BuildFromFile(&char_n_gram02, vectors_dir, 100); | |||
| EXPECT_EQ(s02, Status::OK()); | |||
| std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(char_n_gram02); | |||
| auto transform02 = Execute({to_vectors02}); | |||
| Status status02 = transform02(token, &lookup_result); | |||
| EXPECT_EQ(lookup_result.Shape(), ms_expected01.Shape()); | |||
| EXPECT_TRUE(status02.IsOk()); | |||
| std::shared_ptr<CharNGram> char_n_gram03; | |||
| Status s03 = CharNGram::BuildFromFile(&char_n_gram03, vectors_dir, 18); | |||
| EXPECT_EQ(s03, Status::OK()); | |||
| std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(char_n_gram03); | |||
| auto transform03 = Execute({to_vectors03}); | |||
| Status status03 = transform03(token, &lookup_result); | |||
| EXPECT_EQ(lookup_result.Shape(), ms_expected02.Shape()); | |||
| EXPECT_TRUE(status03.IsOk()); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test basic usage of ToVectors and the CharNGram with default parameter | |||
| /// Expectation: get correct MSTensor | |||
| TEST_F(MindDataTestExecute, TestToVectorsParamForCharNGram) { | |||
| MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParamForCharNGram."; | |||
| std::shared_ptr<Tensor> de_tensor01; | |||
| Tensor::CreateScalar<std::string>("none", &de_tensor01); | |||
| auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01)); | |||
| std::shared_ptr<Tensor> de_tensor02; | |||
| Tensor::CreateScalar<std::string>("the", &de_tensor02); | |||
| auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02)); | |||
| std::shared_ptr<Tensor> de_tensor03; | |||
| Tensor::CreateScalar<std::string>("The", &de_tensor03); | |||
| auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03)); | |||
| mindspore::MSTensor lookup_result; | |||
| // Create expected output. | |||
| std::shared_ptr<Tensor> de_expected01; | |||
| std::vector<float> expected01(5, 0); | |||
| ASSERT_OK(Tensor::CreateFromVector(expected01, &de_expected01)); | |||
| auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01)); | |||
| std::shared_ptr<Tensor> de_expected02; | |||
| std::vector<float> expected02(5, -1); | |||
| ASSERT_OK(Tensor::CreateFromVector(expected02, &de_expected02)); | |||
| auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02)); | |||
| std::shared_ptr<Tensor> de_expected03; | |||
| std::vector<float> expected03 = {-0.840079,-0.0270003,-0.833472,0.588367,-0.210012}; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected03, &de_expected03)); | |||
| auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03)); | |||
| // Transform params. | |||
| std::string vectors_dir = "data/dataset/testVectors/char_n_gram_20.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(char_n_gram); | |||
| auto transform01 = Execute({to_vectors01}); | |||
| Status status01 = transform01(token01, &lookup_result); | |||
| EXPECT_EQ(lookup_result.Shape(), ms_expected01.Shape()); | |||
| EXPECT_TRUE(status01.IsOk()); | |||
| std::vector<float> unknown_init(5, -1); | |||
| std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(char_n_gram, unknown_init); | |||
| auto transform02 = Execute({to_vectors02}); | |||
| Status status02 = transform02(token01, &lookup_result); | |||
| EXPECT_EQ(lookup_result.Shape(), ms_expected02.Shape()); | |||
| EXPECT_TRUE(status02.IsOk()); | |||
| std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(char_n_gram, unknown_init); | |||
| auto transform03 = Execute({to_vectors03}); | |||
| Status status03 = transform03(token02, &lookup_result); | |||
| EXPECT_EQ(lookup_result.Shape(), ms_expected03.Shape()); | |||
| EXPECT_TRUE(status03.IsOk()); | |||
| std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true); | |||
| auto transform04 = Execute({to_vectors04}); | |||
| Status status04 = transform04(token03, &lookup_result); | |||
| EXPECT_EQ(lookup_result.Shape(), ms_expected03.Shape()); | |||
| EXPECT_TRUE(status04.IsOk()); | |||
| } | |||
| /// Feature: CharNGram | |||
| /// Description: test invalid parameter of ToVectors | |||
| /// Expectation: throw exception correctly | |||
| TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForCharNGram) { | |||
| MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParamForCharNGram."; | |||
| std::shared_ptr<Tensor> de_tensor; | |||
| Tensor::CreateScalar<std::string>("none", &de_tensor); | |||
| auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor)); | |||
| mindspore::MSTensor lookup_result; | |||
| // Transform params. | |||
| std::string vectors_dir = "data/dataset/testVectors/char_n_gram_20.txt"; | |||
| std::shared_ptr<CharNGram> char_n_gram01; | |||
| Status s = CharNGram::BuildFromFile(&char_n_gram01, vectors_dir); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::vector<float> unknown_init(4, -1); | |||
| std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(char_n_gram01, unknown_init); | |||
| auto transform01 = Execute({to_vectors01}); | |||
| Status status01 = transform01(token, &lookup_result); | |||
| EXPECT_FALSE(status01.IsOk()); | |||
| std::shared_ptr<CharNGram> char_n_gram02 = nullptr; | |||
| std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(char_n_gram02); | |||
| auto transform02 = Execute({to_vectors02}); | |||
| Status status02 = transform02(token, &lookup_result); | |||
| EXPECT_FALSE(status02.IsOk()); | |||
| } | |||
| // Feature: DBToAmplitude | |||
| // Description: test DBToAmplitude in eager mode | |||
| // Expectation: the data is processed successfully | |||
| @@ -0,0 +1,20 @@ | |||
| 1gram-e -0.655379 0.574261 -0.714026 -0.148858 -0.0534275 | |||
| 1gram-a -0.288984 -0.225616 0.323913 -0.261039 -0.0628034 | |||
| 1gram-t 0.408448 0.175862 -0.296873 -0.209094 -0.53478 | |||
| 1gram-i 0.278486 -0.910641 -0.743681 -0.734405 0.519959 | |||
| 1gram-n -0.0712582 0.0898121 -1.12567 -0.815067 -0.435836 | |||
| 1gram-o -0.182786 0.535789 -0.391385 0.181972 0.317399 | |||
| 1gram-r 0.68474 0.103464 0.201631 -0.65319 0.554142 | |||
| 1gram-s -0.175988 -0.813322 0.465603 -0.0951031 0.193374 | |||
| 1gram-h -0.39348 -0.678079 0.233101 0.431805 2.04905 | |||
| 1gram-l -0.451299 -0.268223 -0.787034 -0.991984 0.251244 | |||
| 1gram-d 0.799629 -0.326191 -0.474959 0.235657 0.796227 | |||
| 2gram-e#END# -2.26956 0.288491 -0.740001 0.661703 0.147355 | |||
| 1gram-c -0.0413309 0.436135 -0.835305 -1.64429 -1.08329 | |||
| 2gram-s#END# 0.657201 2.11761 -1.59276 0.432072 1.21395 | |||
| 1gram-u -0.25203 -0.176365 -0.263038 -0.995372 -1.24916 | |||
| 2gram-#BEGIN#t -0.96853 -0.789463 0.515762 2.02107 -1.64635 | |||
| 1gram-m 0.422293 -0.149725 -0.734202 1.27342 0.232722 | |||
| 2gram-he -0.785562 0.63378 -1.23667 -0.693956 0.395988 | |||
| 2gram-th 0.663336 -0.240809 -1.87298 0.364651 0.26296 | |||
| 2gram-n#END# -0.149612 -0.664577 -1.12344 2.23695 0.610406 | |||
| @@ -0,0 +1,20 @@ | |||
| 1gram-e -0.655379 0.574261 -0.714026 -0.148858 -0.0534275 | |||
| 1gram-a -0.288984 -0.225616 0.323913 -0.261039 -0.0628034 | |||
| 1gram-t 0.408448 0.175862 -0.296873 -0.209094 -0.53478 | |||
| 1gram-i 0.278486 -0.910641 -0.743681 -0.734405 0.519959 | |||
| 1gram-n -0.0712582 0.0898121 -1.12567 -0.815067 -0.435836 | |||
| 1gram-o -0.182786 0.535789 -0.391385 0.181972 0.317399 | |||
| 1gram-r 0.68474 0.103464 0.201631 -0.65319 0.554142 | |||
| 1gram-s -0.175988 -0.813322 0.465603 -0.0951031 0.193374 | |||
| 1gram-h -0.39348 -0.678079 0.233101 0.431805 2.04905 | |||
| 1gram-l -0.451299 -0.268223 -0.787034 -0.991984 0.251244 | |||
| 1gram-d 0.799629 -0.326191 -0.474959 0.235657 0.796227 | |||
| 2gram-e#END# -2.26956 0.288491 -0.740001 0.661703 0.147355 | |||
| 1gram-c -0.0413309 0.436135 -0.835305 -1.64429 -1.08329 | |||
| 2gram-s#END# 0.657201 2.11761 -1.59276 0.432072 1.21395 | |||
| 1gram-u -0.25203 -0.176365 -0.263038 -0.995372 -1.24916 | |||
| 2gram-#BEGIN#t -0.96853 -0.789463 0.515762 2.02107 | |||
| 1gram-m 0.422293 -0.149725 -0.734202 1.27342 0.232722 | |||
| 2gram-he -0.785562 0.63378 -1.23667 -0.693956 0.395988 | |||
| 2gram-th 0.663336 -0.240809 -1.87298 0.364651 0.26296 | |||
| 2gram-n#END# -0.149612 -0.664577 -1.12344 2.23695 0.610406 | |||
| @@ -1,7 +1,7 @@ | |||
| ok | |||
| ! | |||
| This | |||
| iS | |||
| my | |||
| HOME | |||
| . | |||
| ok | |||
| ! | |||
| This | |||
| iS | |||
| my | |||
| HOME | |||
| . | |||
| @@ -0,0 +1,217 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| import numpy as np | |||
| import pytest | |||
| from mindspore import log | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.text as text | |||
| import mindspore.dataset.text.transforms as T | |||
| DATASET_ROOT_PATH = "../data/dataset/testVectors/" | |||
| def _count_unequal_element(data_expected, data_me, rtol, atol): | |||
| assert data_expected.shape == data_me.shape | |||
| total_count = len(data_expected.flatten()) | |||
| error = np.abs(data_expected - data_me) | |||
| greater = np.greater(error, atol + np.abs(data_expected)*rtol) | |||
| loss_count = np.count_nonzero(greater) | |||
| assert (loss_count/total_count) < rtol,\ | |||
| "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".\ | |||
| format(data_expected[greater], data_me[greater], error[greater]) | |||
| def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True): | |||
| if np.any(np.isnan(data_expected)): | |||
| assert np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan) | |||
| elif not np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan): | |||
| _count_unequal_element(data_expected, data_me, rtol, atol) | |||
| else: | |||
| assert True | |||
| def test_char_n_gram_all_to_vectors_params_eager(): | |||
| """ | |||
| Feature: CharNGram | |||
| Description: test with all parameters which include `unk_init` | |||
| and `lower_case_backup` in function ToVectors in eager mode | |||
| Expectation: output is equal to the expected value | |||
| """ | |||
| char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=18) | |||
| unk_init = (-np.ones(5)).tolist() | |||
| to_vectors = T.ToVectors(char_n_gram, unk_init=unk_init, lower_case_backup=True) | |||
| result1 = to_vectors("THE") | |||
| result2 = to_vectors(".") | |||
| result3 = to_vectors("To") | |||
| res = [[-1.34121733e+00, 4.42693333e-02, -4.86969667e-01, 6.62939000e-01, -3.67669000e-01], | |||
| [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, -1.00000000e+00, -1.00000000e+00], | |||
| [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]] | |||
| res_array = np.array(res, dtype=np.float32) | |||
| allclose_nparray(res_array[0], result1, 0.0001, 0.0001) | |||
| allclose_nparray(res_array[1], result2, 0.0001, 0.0001) | |||
| allclose_nparray(res_array[2], result3, 0.0001, 0.0001) | |||
| def test_char_n_gram_build_from_file(): | |||
| """ | |||
| Feature: CharNGram | |||
| Description: test with only default parameter | |||
| Expectation: output is equal to the expected value | |||
| """ | |||
| char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt") | |||
| to_vectors = text.ToVectors(char_n_gram) | |||
| data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False) | |||
| data = data.map(operations=to_vectors, input_columns=["text"]) | |||
| ind = 0 | |||
| res = [[0., 0., 0., 0., 0.], | |||
| [0., 0., 0., 0., 0.], | |||
| [0.117336, 0.362446, -0.983326, 0.939264, -0.05648], | |||
| [0.657201, 2.11761, -1.59276, 0.432072, 1.21395], | |||
| [0., 0., 0., 0., 0.], | |||
| [-2.26956, 0.288491, -0.740001, 0.661703, 0.147355], | |||
| [0., 0., 0., 0., 0.]] | |||
| for d in data.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| res_array = np.array(res[ind], dtype=np.float32) | |||
| allclose_nparray(res_array, d["text"], 0.0001, 0.0001) | |||
| ind += 1 | |||
| def test_char_n_gram_all_build_from_file_params(): | |||
| """ | |||
| Feature: CharNGram | |||
| Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile | |||
| Expectation: output is equal to the expected value | |||
| """ | |||
| char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=100) | |||
| to_vectors = text.ToVectors(char_n_gram) | |||
| data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False) | |||
| data = data.map(operations=to_vectors, input_columns=["text"]) | |||
| ind = 0 | |||
| res = [[0., 0., 0., 0., 0.], | |||
| [0., 0., 0., 0., 0.], | |||
| [0.117336, 0.362446, -0.983326, 0.939264, -0.05648], | |||
| [0.657201, 2.11761, -1.59276, 0.432072, 1.21395], | |||
| [0., 0., 0., 0., 0.], | |||
| [-2.26956, 0.288491, -0.740001, 0.661703, 0.147355], | |||
| [0., 0., 0., 0., 0.]] | |||
| for d in data.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| res_array = np.array(res[ind], dtype=np.float32) | |||
| allclose_nparray(res_array, d["text"], 0.0001, 0.0001) | |||
| ind += 1 | |||
| def test_char_n_gram_all_build_from_file_params_eager(): | |||
| """ | |||
| Feature: CharNGram | |||
| Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile in eager mode | |||
| Expectation: output is equal to the expected value | |||
| """ | |||
| char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=18) | |||
| to_vectors = T.ToVectors(char_n_gram) | |||
| result1 = to_vectors("the") | |||
| result2 = to_vectors(".") | |||
| result3 = to_vectors("to") | |||
| res = [[-1.34121733e+00, 4.42693333e-02, -4.86969667e-01, 6.62939000e-01, -3.67669000e-01], | |||
| [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], | |||
| [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]] | |||
| res_array = np.array(res, dtype=np.float32) | |||
| allclose_nparray(res_array[0], result1, 0.0001, 0.0001) | |||
| allclose_nparray(res_array[1], result2, 0.0001, 0.0001) | |||
| allclose_nparray(res_array[2], result3, 0.0001, 0.0001) | |||
| def test_char_n_gram_build_from_file_eager(): | |||
| """ | |||
| Feature: CharNGram | |||
| Description: test with only default parameter in eager mode | |||
| Expectation: output is equal to the expected value | |||
| """ | |||
| char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt") | |||
| to_vectors = T.ToVectors(char_n_gram) | |||
| result1 = to_vectors("the") | |||
| result2 = to_vectors(".") | |||
| result3 = to_vectors("to") | |||
| res = [[-8.40079000e-01, -2.70002500e-02, -8.33472250e-01, 5.88367000e-01, -2.10011750e-01], | |||
| [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], | |||
| [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]] | |||
| res_array = np.array(res, dtype=np.float32) | |||
| allclose_nparray(res_array[0], result1, 0.0001, 0.0001) | |||
| allclose_nparray(res_array[1], result2, 0.0001, 0.0001) | |||
| allclose_nparray(res_array[2], result3, 0.0001, 0.0001) | |||
| def test_char_n_gram_invalid_input(): | |||
| """ | |||
| Feature: CharNGram | |||
| Description: test the validate function with invalid parameters. | |||
| Expectation: Verification of correct error message for invalid input. | |||
| """ | |||
| def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None, | |||
| unk_init=None, lower_case_backup=False, token="ok"): | |||
| log.info("Test CharNGram with wrong input: {0}".format(test_name)) | |||
| with pytest.raises(error) as error_info: | |||
| char_n_gram = text.CharNGram.from_file(file_path, max_vectors=max_vectors) | |||
| to_vectors = T.ToVectors(char_n_gram, unk_init=unk_init, lower_case_backup=lower_case_backup) | |||
| to_vectors(token) | |||
| assert error_msg in str(error_info.value) | |||
| test_invalid_input("Not all vectors have the same number of dimensions", | |||
| DATASET_ROOT_PATH + "char_n_gram_20_dim_different.txt", error=RuntimeError, | |||
| error_msg="all vectors must have the same number of dimensions, " + | |||
| "but got dim 4 while expecting 5") | |||
| test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "vectors_empty.txt", | |||
| error=RuntimeError, error_msg="invalid file, file is empty.") | |||
| test_invalid_input("the count of `unknown_init`'s element is different with word vector.", | |||
| DATASET_ROOT_PATH + "char_n_gram_20.txt", | |||
| error=RuntimeError, error_msg="unk_init must be the same length as vectors, " + | |||
| "but got unk_init: 6 and vectors: 5", unk_init=np.ones(6).tolist()) | |||
| test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.txt", RuntimeError, | |||
| error_msg="get real path failed") | |||
| test_invalid_input("max_vectors parameter must be greater than 0", | |||
| DATASET_ROOT_PATH + "char_n_gram_20.txt", error=ValueError, | |||
| error_msg="Input max_vectors is not within the required interval", max_vectors=-1) | |||
| test_invalid_input("invalid max_vectors parameter type as a float", | |||
| DATASET_ROOT_PATH + "char_n_gram_20.txt", error=TypeError, | |||
| error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>]," | |||
| " but got <class 'float'>.", max_vectors=1.0) | |||
| test_invalid_input("invalid max_vectors parameter type as a string", | |||
| DATASET_ROOT_PATH + "char_n_gram_20.txt", error=TypeError, | |||
| error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>]," | |||
| " but got <class 'str'>.", max_vectors="1") | |||
| test_invalid_input("invalid token parameter type as a float", | |||
| DATASET_ROOT_PATH + "char_n_gram_20.txt", error=RuntimeError, | |||
| error_msg="input tensor type should be string.", token=1.0) | |||
| test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "char_n_gram_20.txt", | |||
| error=TypeError, error_msg="Argument lower_case_backup with " + | |||
| "value True is not of type [<class 'bool'>]," | |||
| " but got <class 'str'>.", lower_case_backup="True") | |||
| test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "char_n_gram_20.txt", | |||
| error=TypeError, error_msg="Argument lower_case_backup with " + | |||
| "value True is not of type [<class 'bool'>]," | |||
| " but got <class 'str'>.", lower_case_backup="True") | |||
| if __name__ == '__main__': | |||
| test_char_n_gram_all_to_vectors_params_eager() | |||
| test_char_n_gram_build_from_file() | |||
| test_char_n_gram_all_build_from_file_params() | |||
| test_char_n_gram_all_build_from_file_params_eager() | |||
| test_char_n_gram_build_from_file_eager() | |||
| test_char_n_gram_invalid_input() | |||
| @@ -42,7 +42,6 @@ def test_fast_text_all_build_from_file_params(): | |||
| [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246], | |||
| [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923], | |||
| [0, 0, 0, 0, 0, 0]] | |||
| print(data) | |||
| for d in data.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| res_array = np.array(res[ind], dtype=np.float32) | |||
| assert np.array_equal(res_array, d["text"]), ind | |||
| @@ -135,7 +134,6 @@ def test_fast_text_build_from_file(): | |||
| [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246], | |||
| [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923], | |||
| [0, 0, 0, 0, 0, 0]] | |||
| print(data) | |||
| for d in data.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| res_array = np.array(res[ind], dtype=np.float32) | |||
| assert np.array_equal(res_array, d["text"]), ind | |||
| @@ -42,7 +42,6 @@ def test_glove_all_build_from_file_params(): | |||
| [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246], | |||
| [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923], | |||
| [0, 0, 0, 0, 0, 0]] | |||
| print(data) | |||
| for d in data.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| res_array = np.array(res[ind], dtype=np.float32) | |||
| assert np.array_equal(res_array, d["text"]), ind | |||
| @@ -135,7 +134,6 @@ def test_glove_build_from_file(): | |||
| [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246], | |||
| [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923], | |||
| [0, 0, 0, 0, 0, 0]] | |||
| print(data) | |||
| for d in data.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| res_array = np.array(res[ind], dtype=np.float32) | |||
| assert np.array_equal(res_array, d["text"]), ind | |||
| @@ -101,7 +101,6 @@ def test_vectors_from_file_all_buildfromfile_params(): | |||
| [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246], | |||
| [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923], | |||
| [0, 0, 0, 0, 0, 0]] | |||
| print(data) | |||
| for d in data.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| res_array = np.array(res[ind], dtype=np.float32) | |||
| assert np.array_equal(res_array, d["text"]), ind | |||