| @@ -19,6 +19,7 @@ | |||
| #include "minddata/dataset/api/python/pybind_register.h" | |||
| #include "minddata/dataset/include/dataset/constants.h" | |||
| #include "minddata/dataset/text/fast_text.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| #include "minddata/dataset/text/vectors.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| @@ -88,6 +89,16 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) { | |||
| .export_values(); | |||
| })); | |||
// Exposes FastText to Python as a subclass of Vectors.
PYBIND_REGISTER(FastText, 1, ([](const py::module *m) {
                  (void)py::class_<FastText, Vectors, std::shared_ptr<FastText>>(*m, "FastText")
                    .def(py::init<>())
                    // Factory: builds a FastText table from a `*.vec` file.
                    // max_vectors == 0 means "load every vector in the file".
                    .def_static("from_file", [](const std::string &path, int32_t max_vectors) {
                      std::shared_ptr<FastText> fast_text;
                      THROW_IF_ERROR(FastText::BuildFromFile(&fast_text, path, max_vectors));
                      return fast_text;
                    });
                }));
| PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) { | |||
| (void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors") | |||
| .def(py::init<>()) | |||
| @@ -630,7 +630,7 @@ class MS_API ToNumber final : public TensorTransform { | |||
| }; | |||
| /// \brief Look up a token into an vector according to the input Vectors table. | |||
| class ToVectors final : public TensorTransform { | |||
| class MS_API ToVectors final : public TensorTransform { | |||
| public: | |||
| /// \brief Constructor. | |||
| /// \param[in] vectors A Vectors object. | |||
| @@ -4,9 +4,10 @@ add_subdirectory(kernels) | |||
| file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
# Object library with the text-processing sources.
# Fix: sentence_piece_vocab.cc was listed twice, which at best is redundant and
# at worst triggers duplicate-symbol/duplicate-rule warnings in some generators.
add_library(text OBJECT
  fast_text.cc
  sentence_piece_vocab.cc
  vectors.cc
  vocab.cc
)
| add_dependencies(text text-kernels) | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/text/fast_text.h" | |||
| #include "utils/file_utils.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
// Delegates storage of the token->vector map and its dimension to the Vectors base class.
FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {}
| Status CheckFastText(const std::string &file_path) { | |||
| Path path = Path(file_path); | |||
| if (path.Exists() && !path.IsDirectory()) { | |||
| std::string basename = path.Basename(); | |||
| size_t dot = basename.rfind('.'); | |||
| std::string suffix = basename.substr(dot + 1); | |||
| if (suffix != "vec") { | |||
| RETURN_STATUS_UNEXPECTED("FastText: invalid file, can not find file '*.vec', but got: " + file_path); | |||
| } | |||
| return Status::OK(); | |||
| } else { | |||
| RETURN_STATUS_UNEXPECTED("FastText: invalid file, failed to open FastText file."); | |||
| } | |||
| } | |||
// Builds a FastText object by validating `path` (existing `*.vec` file) and
// loading at most `max_vectors` rows from it.
// \param[out] fast_text Receives the constructed FastText object.
// \param[in] path Path to the pre-trained word vector file.
// \param[in] max_vectors Limit on loaded vectors; 0 means no limit.
Status FastText::BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors) {
  RETURN_UNEXPECTED_IF_NULL(fast_text);
  RETURN_IF_NOT_OK(CheckFastText(path));
  std::unordered_map<std::string, std::vector<float>> map;
  int vector_dim = -1;
  // Load() is presumably inherited from Vectors (not declared in fast_text.h);
  // it fills `map` and infers the vector dimension — TODO confirm against vectors.h.
  RETURN_IF_NOT_OK(Load(path, max_vectors, &map, &vector_dim));
  // NOTE(review): the FastText ctor takes the map by const&, so std::move here
  // is effectively a copy — harmless, but not an actual move.
  *fast_text = std::make_shared<FastText>(std::move(map), vector_dim);
  return Status::OK();
}
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,55 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/include/dataset/iterator.h" | |||
| #include "minddata/dataset/text/vectors.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| /// \brief Pre-train word vectors. | |||
/// \brief Pre-trained word vectors in the FastText `*.vec` format.
class FastText : public Vectors {
 public:
  /// \brief Default constructor: creates an empty vector table.
  FastText() = default;

  /// \brief Constructor.
  /// \param[in] map A map between string tokens and their vectors.
  /// \param[in] dim Dimension of the vectors.
  FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim);

  /// \brief Destructor.
  ~FastText() = default;

  /// \brief Build a FastText object by reading a pre-trained vector file.
  /// \param[out] fast_text FastText object which contains the pre-trained vectors.
  /// \param[in] path Path to the pre-trained word vector file. The file suffix must be `.vec`.
  /// \param[in] max_vectors Limit on the number of pre-trained vectors loaded (default=0, no limit).
  /// \return Status error code, OK if the file was loaded successfully.
  static Status BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors = 0);
};
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_ | |||
| @@ -30,6 +30,8 @@ namespace dataset { | |||
// Forward declarations for types used by the text transforms below.
// Fix: Vectors and Vocab were forward-declared twice; the duplicates are redundant.
class Vectors;
class Vocab;
class SentencePieceVocab;
| // Transform operations for text | |||
| namespace text { | |||
| @@ -28,13 +28,13 @@ import platform | |||
| from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \ | |||
| TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors | |||
| from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \ | |||
| SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors | |||
| SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText | |||
| __all__ = [ | |||
| "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram", | |||
| "to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber", | |||
| "PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType", | |||
| "SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors" | |||
| "SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors", "FastText" | |||
| ] | |||
| if platform.system().lower() != 'windows': | |||
| @@ -27,7 +27,7 @@ from .validators import check_from_file, check_from_list, check_from_dict, check | |||
| check_from_file_vectors | |||
| __all__ = [ | |||
| "Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors" | |||
| "Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText" | |||
| ] | |||
| @@ -411,3 +411,30 @@ class Vectors(cde.Vectors): | |||
| max_vectors = max_vectors if max_vectors is not None else 0 | |||
| return super().from_file(file_path, max_vectors) | |||
class FastText(cde.FastText):
    """
    FastText object that is used to map tokens into vectors.
    """

    @classmethod
    @check_from_file_vectors
    def from_file(cls, file_path, max_vectors=None):
        """
        Build a FastText vector from a file.

        Args:
            file_path (str): Path of the file that contains the vectors. The suffix of the pre-trained
                vector sets must be `*.vec`.
            max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
                Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
                situations where the entire set doesn't fit in memory, or is not needed for another reason,
                passing max_vectors can limit the size of the loaded set (default=None, no limit).

        Examples:
            >>> fast_text = text.FastText.from_file("/path/to/fast_text/file", max_vectors=None)
        """
        # The C++ binding expects an int; translate None into 0, meaning "no limit".
        max_vectors = max_vectors if max_vectors is not None else 0
        return super().from_file(file_path, max_vectors)
| @@ -23,11 +23,13 @@ | |||
| #include "minddata/dataset/include/dataset/datasets.h" | |||
| #include "minddata/dataset/include/dataset/text.h" | |||
| #include "minddata/dataset/include/dataset/transforms.h" | |||
| #include "minddata/dataset/text/fast_text.h" | |||
| #include "minddata/dataset/text/vectors.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::Status; | |||
| using mindspore::dataset::FastText; | |||
| using mindspore::dataset::ShuffleMode; | |||
| using mindspore::dataset::Tensor; | |||
| using mindspore::dataset::Vectors; | |||
| @@ -3943,3 +3945,357 @@ TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) { | |||
| Status s = Vectors::BuildFromFile(&vectors, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with default parameter in function BuildFromFile and function Lookup | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) { | |||
| // Test with default parameter. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({lookup}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| uint64_t i = 0; | |||
| std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, | |||
| {0, 0, 0, 0, 0, 0}, | |||
| {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, | |||
| {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, | |||
| {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, | |||
| {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, | |||
| {0, 0, 0, 0, 0, 0}}; | |||
| while (row.size() != 0) { | |||
| auto ind = row["text"]; | |||
| MS_LOG(INFO) << ind.Shape(); | |||
| TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); | |||
| TensorPtr de_expected_item; | |||
| dsize_t dim = 6; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); | |||
| mindspore::MSTensor ms_expected_item = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item)); | |||
| EXPECT_MSTENSOR_EQ(ind, ms_expected_item); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) { | |||
| // Test with two parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({lookup}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| uint64_t i = 0; | |||
| std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, | |||
| {0, 0, 0, 0, 0, 0}, | |||
| {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, | |||
| {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, | |||
| {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, | |||
| {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, | |||
| {0, 0, 0, 0, 0, 0}}; | |||
| while (row.size() != 0) { | |||
| auto ind = row["text"]; | |||
| MS_LOG(INFO) << ind.Shape(); | |||
| TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); | |||
| TensorPtr de_expected_item; | |||
| dsize_t dim = 6; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); | |||
| mindspore::MSTensor ms_expected_item = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item)); | |||
| EXPECT_MSTENSOR_EQ(ind, ms_expected_item); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) { | |||
| // Test with two parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1}; | |||
| std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({lookup}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| uint64_t i = 0; | |||
| std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, | |||
| {-1, -1, -1, -1, -1, -1}, | |||
| {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, | |||
| {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, | |||
| {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, | |||
| {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, | |||
| {-1, -1, -1, -1, -1, -1}}; | |||
| while (row.size() != 0) { | |||
| auto ind = row["text"]; | |||
| MS_LOG(INFO) << ind.Shape(); | |||
| TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); | |||
| TensorPtr de_expected_item; | |||
| dsize_t dim = 6; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); | |||
| mindspore::MSTensor ms_expected_item = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item)); | |||
| EXPECT_MSTENSOR_EQ(ind, ms_expected_item); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`, | |||
| /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters | |||
| /// Expectation: return correct MSTensor which is equal to the expected | |||
| TEST_F(MindDataTestPipeline, TestFastTextAllParams) { | |||
| // Test with all parameters. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1}; | |||
| std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init, true); | |||
| EXPECT_NE(lookup, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({lookup}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| uint64_t i = 0; | |||
| std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, | |||
| {-1, -1, -1, -1, -1, -1}, | |||
| {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, | |||
| {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, | |||
| {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, | |||
| {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, | |||
| {-1, -1, -1, -1, -1, -1}}; | |||
| while (row.size() != 0) { | |||
| auto ind = row["text"]; | |||
| MS_LOG(INFO) << ind.Shape(); | |||
| TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); | |||
| TensorPtr de_expected_item; | |||
| dsize_t dim = 6; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); | |||
| mindspore::MSTensor ms_expected_item = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item)); | |||
| EXPECT_MSTENSOR_EQ(ind, ms_expected_item); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with pre-vectors set that have the different dimension | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) { | |||
| // Tokens don't have the same number of vectors. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with the parameter max_vectors that is <= 0 | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) { | |||
| // Test with max_vectors <= 0. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir, -1); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with the pre-vectors file that is empty | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) { | |||
| // Read empty file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with the pre-vectors file that is not exist | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) { | |||
| // Test with not exist file. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/no_fasttext.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) { | |||
| // wrong info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test with the pre-vectors set that has a wrong suffix | |||
| /// Expectation: throw correct error and message | |||
| TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) { | |||
| // wrong info. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.txt"; | |||
| std::shared_ptr<FastText> fast_text; | |||
| Status s = FastText::BuildFromFile(&fast_text, vectors_dir); | |||
| EXPECT_NE(s, Status::OK()); | |||
| } | |||
| @@ -23,11 +23,13 @@ | |||
| #include "minddata/dataset/include/dataset/vision.h" | |||
| #include "minddata/dataset/include/dataset/audio.h" | |||
| #include "minddata/dataset/include/dataset/text.h" | |||
| #include "minddata/dataset/text/fast_text.h" | |||
| #include "minddata/dataset/text/vectors.h" | |||
| #include "utils/log_adapter.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::LogStream; | |||
| using mindspore::dataset::FastText; | |||
| using mindspore::dataset::Vectors; | |||
| using mindspore::ExceptionType::NoExceptionType; | |||
| using mindspore::MsLogLevel::INFO; | |||
| @@ -1665,6 +1667,140 @@ TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) { | |||
| EXPECT_FALSE(status02.IsOk()); | |||
| } | |||
| /// Feature: FastText | |||
| /// Description: test basic usage of FastText and the ToVectors with default parameter | |||
| /// Expectation: get correct MSTensor | |||
| TEST_F(MindDataTestExecute, TestFastTextParam) { | |||
| MS_LOG(INFO) << "Doing MindDataTestExecute-TestFastTextParam."; | |||
| std::shared_ptr<Tensor> de_tensor; | |||
| Tensor::CreateScalar<std::string>("ok", &de_tensor); | |||
| auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor)); | |||
| mindspore::MSTensor lookup_result; | |||
| // Create expected output. | |||
| std::shared_ptr<Tensor> de_expected; | |||
| std::vector<float> expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}; | |||
| dsize_t dim = 6; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected)); | |||
| auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected)); | |||
| // Transform params. | |||
| std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec"; | |||
| std::shared_ptr<FastText> fast_text01; | |||
| Status s01 = FastText::BuildFromFile(&fast_text01, vectors_dir); | |||
| EXPECT_EQ(s01, Status::OK()); | |||
| std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text01); | |||
| auto transform01 = Execute({to_vectors01}); | |||
| Status status01 = transform01(token, &lookup_result); | |||
| EXPECT_MSTENSOR_EQ(lookup_result, ms_expected); | |||
| EXPECT_TRUE(status01.IsOk()); | |||
| std::shared_ptr<FastText> fast_text02; | |||
| Status s02 = FastText::BuildFromFile(&fast_text02, vectors_dir, 100); | |||
| EXPECT_EQ(s02, Status::OK()); | |||
| std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text02); | |||
| auto transform02 = Execute({to_vectors02}); | |||
| Status status02 = transform02(token, &lookup_result); | |||
| EXPECT_MSTENSOR_EQ(lookup_result, ms_expected); | |||
| EXPECT_TRUE(status02.IsOk()); | |||
| std::shared_ptr<FastText> fast_text03; | |||
| Status s03 = FastText::BuildFromFile(&fast_text03, vectors_dir, 3); | |||
| EXPECT_EQ(s03, Status::OK()); | |||
| std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text03); | |||
| auto transform03 = Execute({to_vectors03}); | |||
| Status status03 = transform03(token, &lookup_result); | |||
| EXPECT_MSTENSOR_EQ(lookup_result, ms_expected); | |||
| EXPECT_TRUE(status03.IsOk()); | |||
| } | |||
/// Feature: ToVectors
/// Description: test basic usage of ToVectors and the FastText with default parameter
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestToVectorsParamForFastText) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParamForFastText.";
  // Three probe tokens: "none" is out-of-vocabulary, "ok" is in-vocabulary,
  // and "OK" only resolves when lower_case_backup is enabled.
  std::shared_ptr<Tensor> de_tensor01;
  Tensor::CreateScalar<std::string>("none", &de_tensor01);
  auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
  std::shared_ptr<Tensor> de_tensor02;
  Tensor::CreateScalar<std::string>("ok", &de_tensor02);
  auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
  std::shared_ptr<Tensor> de_tensor03;
  Tensor::CreateScalar<std::string>("OK", &de_tensor03);
  auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
  mindspore::MSTensor lookup_result;
  // Create expected output.
  dsize_t dim = 6;
  // expected01: default unknown vector (all zeros) for an OOV token.
  std::shared_ptr<Tensor> de_expected01;
  std::vector<float> expected01 = {0, 0, 0, 0, 0, 0};
  ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01));
  auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
  // expected02: user-supplied unknown_init vector for an OOV token.
  std::shared_ptr<Tensor> de_expected02;
  std::vector<float> expected02 = {-1, -1, -1, -1, -1, -1};
  ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02));
  auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
  // expected03: the pre-trained vector stored for "ok".
  std::shared_ptr<Tensor> de_expected03;
  std::vector<float> expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
  ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03));
  auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));
  // Transform params.
  std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_EQ(s, Status::OK());
  // Case 1: OOV token with default ToVectors -> zero vector.
  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text);
  auto transform01 = Execute({to_vectors01});
  Status status01 = transform01(token01, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
  EXPECT_TRUE(status01.IsOk());
  // Case 2: OOV token with explicit unknown_init -> the unknown_init vector.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  auto transform02 = Execute({to_vectors02});
  Status status02 = transform02(token01, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
  EXPECT_TRUE(status02.IsOk());
  // Case 3: in-vocabulary token -> its pre-trained vector; unknown_init is ignored.
  std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  auto transform03 = Execute({to_vectors03});
  Status status03 = transform03(token02, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
  EXPECT_TRUE(status03.IsOk());
  // Case 4: upper-case token with lower_case_backup=true -> the vector of "ok".
  std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
  auto transform04 = Execute({to_vectors04});
  Status status04 = transform04(token03, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
  EXPECT_TRUE(status04.IsOk());
}
| /// Feature: ToVectors | |||
| /// Description: test invalid parameter of ToVectors for FastText | |||
| /// Expectation: throw exception correctly | |||
| TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForFastText) { | |||
| MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParamForFastText."; | |||
| std::shared_ptr<Tensor> de_tensor; | |||
| Tensor::CreateScalar<std::string>("none", &de_tensor); | |||
| auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor)); | |||
| mindspore::MSTensor lookup_result; | |||
| // Transform params. | |||
| std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec"; | |||
| std::shared_ptr<FastText> fast_text01; | |||
| Status s = FastText::BuildFromFile(&fast_text01, vectors_dir); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| std::vector<float> unknown_init = {-1, -1, -1, -1}; | |||
| std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text01, unknown_init); | |||
| auto transform01 = Execute({to_vectors01}); | |||
| Status status01 = transform01(token, &lookup_result); | |||
| EXPECT_FALSE(status01.IsOk()); | |||
| std::shared_ptr<FastText> fast_text02 = nullptr; | |||
| std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text02); | |||
| auto transform02 = Execute({to_vectors02}); | |||
| Status status02 = transform02(token, &lookup_result); | |||
| EXPECT_FALSE(status02.IsOk()); | |||
| } | |||
/// Feature: DBToAmplitude
/// Description: test DBToAmplitude in eager mode
/// Expectation: the data is processed successfully
| @@ -0,0 +1,7 @@ | |||
| 6 6 | |||
| ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411 | |||
| ! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 | |||
| this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 | |||
| is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 | |||
| my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 | |||
| home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 | |||
| @@ -0,0 +1,7 @@ | |||
| 6 6 | |||
| ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411 | |||
| ! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 | |||
| this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 | |||
| is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 | |||
| my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 | |||
| home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 | |||
| @@ -0,0 +1,7 @@ | |||
| 6 6 | |||
| ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411 | |||
| ! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 | |||
| this 0.15164 0.30177 -0.16763 0.17684 0.31719 | |||
| is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 | |||
| my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 | |||
| home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 | |||
| @@ -0,0 +1,7 @@ | |||
| the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411 | |||
| , 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 | |||
| . 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 | |||
| 6 6 | |||
| of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 | |||
| to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 | |||
| and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 | |||
| @@ -0,0 +1,7 @@ | |||
| ok | |||
| . | |||
| this | |||
| is | |||
| my | |||
| home | |||
| . | |||
| @@ -0,0 +1,7 @@ | |||
| ok | |||
| ! | |||
| This | |||
| iS | |||
| my | |||
| HOME | |||
| . | |||
| @@ -0,0 +1,237 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| import numpy as np | |||
| import pytest | |||
| from mindspore import log | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.text as text | |||
| import mindspore.dataset.text.transforms as T | |||
| DATASET_ROOT_PATH = "../data/dataset/test_fast_text/" | |||
def test_fast_text_all_build_from_file_params():
    """
    Feature: FastText
    Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=100)
    to_vectors = text.ToVectors(vectors)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    ind = 0
    # Expected lookup per line of words.txt; tokens absent from fast_text.vec map to zeros.
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0, 0, 0, 0, 0, 0],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
           [0, 0, 0, 0, 0, 0]]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        assert np.array_equal(res_array, d["text"]), ind
        ind += 1
    # Guard against the iterator silently yielding fewer rows than expected.
    assert ind == len(res)
def test_fast_text_all_build_from_file_params_eager():
    """
    Feature: FastText
    Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
    to_vectors = T.ToVectors(vectors)
    # With max_vectors=4 only the first four entries are loaded; the rest
    # (and the out-of-vocabulary token "none") resolve to the zero vector.
    tokens = ["ok", "!", "this", "is", "my", "home", "none"]
    expected = np.array([[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
                         [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
                         [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
                         [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
                         [0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0]], dtype=np.float32)
    for tok, row in zip(tokens, expected):
        assert np.array_equal(to_vectors(tok), row)
def test_fast_text_all_to_vectors_params_eager():
    """
    Feature: FastText
    Description: test with all parameters which include `unk_init` and `lower_case_backup` in function ToVectors
    in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
    my_unk = [-1, -1, -1, -1, -1, -1]
    to_vectors = T.ToVectors(vectors, unk_init=my_unk, lower_case_backup=True)
    # lower_case_backup=True lets the mixed-case tokens ("Ok", "This") match;
    # tokens beyond the first max_vectors=4 entries fall back to `my_unk`.
    tokens = ["Ok", "!", "This", "is", "my", "home", "none"]
    expected = np.array([[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
                         [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
                         [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
                         [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
                         [-1, -1, -1, -1, -1, -1],
                         [-1, -1, -1, -1, -1, -1],
                         [-1, -1, -1, -1, -1, -1]], dtype=np.float32)
    for tok, row in zip(tokens, expected):
        assert np.array_equal(to_vectors(tok), row)
def test_fast_text_build_from_file():
    """
    Feature: FastText
    Description: test with only default parameter
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
    to_vectors = text.ToVectors(vectors)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    ind = 0
    # Expected lookup per line of words.txt; tokens absent from fast_text.vec map to zeros.
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0, 0, 0, 0, 0, 0],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
           [0, 0, 0, 0, 0, 0]]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        assert np.array_equal(res_array, d["text"]), ind
        ind += 1
    # Guard against the iterator silently yielding fewer rows than expected.
    assert ind == len(res)
def test_fast_text_build_from_file_eager():
    """
    Feature: FastText
    Description: test with only default parameter in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
    to_vectors = T.ToVectors(vectors)
    # All six vocabulary tokens resolve to their file vectors; the
    # out-of-vocabulary token "none" resolves to the zero vector.
    tokens = ["ok", "!", "this", "is", "my", "home", "none"]
    expected = np.array([[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
                         [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
                         [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
                         [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
                         [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
                         [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
                         [0, 0, 0, 0, 0, 0]], dtype=np.float32)
    for tok, row in zip(tokens, expected):
        assert np.array_equal(to_vectors(tok), row)
def test_fast_text_invalid_input():
    """
    Feature: FastText
    Description: test the validate function with invalid parameters
    Expectation: output is equal to the expected error
    """
    def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None, unk_init=None,
                           lower_case_backup=False, token="ok"):
        # Build vectors from file_path, run one lookup, and require `error` whose
        # message contains `error_msg`.
        log.info("Test Vectors with wrong input: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            vectors = text.FastText.from_file(file_path, max_vectors=max_vectors)
            to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
            to_vectors(token)
        assert error_msg in str(error_info.value)
    test_invalid_input("Not all vectors have the same number of dimensions",
                       DATASET_ROOT_PATH + "fast_text_dim_different.vec", error=RuntimeError,
                       error_msg="all vectors must have the same number of dimensions, " \
                                 "but got dim 5 while expecting 6")
    test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "fast_text_empty.vec",
                       error=RuntimeError, error_msg="invalid file, file is empty.")
    test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
                       DATASET_ROOT_PATH + "fast_text.vec",
                       error=RuntimeError,
                       error_msg="unk_init must be the same length as vectors, but got unk_init",
                       unk_init=[-1, -1])
    test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.vec", RuntimeError,
                       error_msg="FastText: invalid file")
    test_invalid_input("The token is 1-dimensional", DATASET_ROOT_PATH + "fast_text_with_wrong_info.vec",
                       error=RuntimeError, error_msg="token with 1-dimensional vector.")
    test_invalid_input("max_vectors parameter must be greater than 0", DATASET_ROOT_PATH + "fast_text.vec",
                       error=ValueError, error_msg="Input max_vectors is not within the required interval",
                       max_vectors=-1)
    test_invalid_input("invalid max_vectors parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
                       error=TypeError, error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
                                                  " but got <class 'float'>.", max_vectors=1.0)
    test_invalid_input("invalid max_vectors parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
                       error=TypeError, error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
                                                  " but got <class 'str'>.", max_vectors="1")
    test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
                       error=RuntimeError, error_msg="input tensor type should be string.", token=1.0)
    # NOTE(review): this case was previously invoked twice with identical arguments;
    # the duplicate invocation has been removed.
    test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
                       error=TypeError, error_msg="Argument lower_case_backup with value True is " \
                                                  "not of type [<class 'bool'>],"
                                                  " but got <class 'str'>.", lower_case_backup="True")
    test_invalid_input("the suffix of pre-training set must be `*.vec`", DATASET_ROOT_PATH + "fast_text.txt",
                       error=RuntimeError, error_msg="FastText: invalid file, can not find file '*.vec'")
if __name__ == '__main__':
    # Run every FastText test case when this file is executed directly.
    for case in (test_fast_text_all_build_from_file_params,
                 test_fast_text_all_build_from_file_params_eager,
                 test_fast_text_all_to_vectors_params_eager,
                 test_fast_text_build_from_file,
                 test_fast_text_build_from_file_eager,
                 test_fast_text_invalid_input):
        case()