Browse Source

!21823 [assistant][ops] Add data operator CharNGram

Merge pull request !21823 from 张渝/CharNGram
tags/v1.6.0
i-robot Gitee 4 years ago
parent
commit
db9b66242b
22 changed files with 962 additions and 47 deletions
  1. +11
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
  2. +1
    -0
      mindspore/ccsrc/minddata/dataset/text/CMakeLists.txt
  3. +98
    -0
      mindspore/ccsrc/minddata/dataset/text/char_n_gram.cc
  4. +64
    -0
      mindspore/ccsrc/minddata/dataset/text/char_n_gram.h
  5. +1
    -1
      mindspore/ccsrc/minddata/dataset/text/fast_text.cc
  6. +1
    -1
      mindspore/ccsrc/minddata/dataset/text/fast_text.h
  7. +1
    -1
      mindspore/ccsrc/minddata/dataset/text/glove.cc
  8. +1
    -1
      mindspore/ccsrc/minddata/dataset/text/glove.h
  9. +2
    -2
      mindspore/ccsrc/minddata/dataset/text/vectors.cc
  10. +4
    -4
      mindspore/ccsrc/minddata/dataset/text/vectors.h
  11. +2
    -2
      mindspore/dataset/text/__init__.py
  12. +27
    -1
      mindspore/dataset/text/utils.py
  13. +1
    -0
      tests/ut/cpp/dataset/CMakeLists.txt
  14. +346
    -22
      tests/ut/cpp/dataset/c_api_text_test.cc
  15. +138
    -0
      tests/ut/cpp/dataset/execute_test.cc
  16. +20
    -0
      tests/ut/data/dataset/testVectors/char_n_gram_20.txt
  17. +20
    -0
      tests/ut/data/dataset/testVectors/char_n_gram_20_dim_different.txt
  18. +7
    -7
      tests/ut/data/dataset/testVectors/words_with_big_letter.txt
  19. +217
    -0
      tests/ut/python/dataset/test_char_n_gram.py
  20. +0
    -2
      tests/ut/python/dataset/test_fast_text.py
  21. +0
    -2
      tests/ut/python/dataset/test_glove.py
  22. +0
    -1
      tests/ut/python/dataset/test_vectors.py

+ 11
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc View File

@@ -19,6 +19,7 @@

#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/text/char_n_gram.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
@@ -90,6 +91,16 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) {
.export_values();
}));

// Expose CharNGram to Python as a subclass of Vectors. `from_file` builds the
// object from a pre-trained n-gram vector file; max_vectors == 0 means no limit.
PYBIND_REGISTER(CharNGram, 1, ([](const py::module *m) {
                  (void)py::class_<CharNGram, Vectors, std::shared_ptr<CharNGram>>(*m, "CharNGram")
                    .def(py::init<>())
                    .def_static("from_file", [](const std::string &path, int32_t max_vectors) {
                      std::shared_ptr<CharNGram> char_n_gram;
                      THROW_IF_ERROR(CharNGram::BuildFromFile(&char_n_gram, path, max_vectors));
                      return char_n_gram;
                    });
                }));

PYBIND_REGISTER(FastText, 1, ([](const py::module *m) {
(void)py::class_<FastText, Vectors, std::shared_ptr<FastText>>(*m, "FastText")
.def(py::init<>())


+ 1
- 0
mindspore/ccsrc/minddata/dataset/text/CMakeLists.txt View File

@@ -4,6 +4,7 @@ add_subdirectory(kernels)
file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(text OBJECT
char_n_gram.cc
fast_text.cc
glove.cc
sentence_piece_vocab.cc


+ 98
- 0
mindspore/ccsrc/minddata/dataset/text/char_n_gram.cc View File

@@ -0,0 +1,98 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/char_n_gram.h"
#include "utils/file_utils.h"
namespace mindspore {
namespace dataset {
// Constructor: delegates storage of the token->vector map and dimension to the Vectors base class.
CharNGram::CharNGram(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) : Vectors(map, dim) {}
// Build a CharNGram object from the pre-trained vector file at `path`.
// `max_vectors` limits the number of vectors loaded (0 means no limit).
Status CharNGram::BuildFromFile(std::shared_ptr<CharNGram> *char_n_gram, const std::string &path, int32_t max_vectors) {
  RETURN_UNEXPECTED_IF_NULL(char_n_gram);
  std::unordered_map<std::string, std::vector<float>> map;
  // Use int32_t to match the Vectors::Load signature (this PR's type cleanup).
  int32_t vector_dim = -1;
  RETURN_IF_NOT_OK(CharNGram::Load(path, max_vectors, &map, &vector_dim));
  // The constructor takes the map by const reference, so a std::move here would
  // be a no-op cast; pass it directly.
  *char_n_gram = std::make_shared<CharNGram>(map, vector_dim);
  return Status::OK();
}
// Look up the embedding vector of `token` by averaging the pre-trained vectors
// of all of its character n-grams (lengths 2, 3 and 4). Returns the OOV vector
// (`unk_init`, or zeros) when no n-gram of the token is found in the map.
std::vector<float> CharNGram::Lookup(const std::string &token, const std::vector<float> &unk_init,
                                     bool lower_case_backup) {
  // Default OOV vector: zeros, unless a correctly-sized `unk_init` is supplied.
  std::vector<float> init_vec(dim_, 0);
  if (!unk_init.empty()) {
    if (unk_init.size() != static_cast<size_t>(dim_)) {
      MS_LOG(WARNING) << "CharNGram: size of unk_init is not the same as vectors, will initialize with zero vectors.";
    } else {
      init_vec = unk_init;
    }
  }
  std::string lower_token = token;
  if (lower_case_backup) {
    (void)std::transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower);
  }
  // Split the token into single-character strings, wrapped with sentinel markers.
  std::vector<std::string> chars;
  chars.reserve(lower_token.length() + 2);
  chars.push_back("#BEGIN#");
  for (const char letter : lower_token) {
    chars.emplace_back(1, letter);  // Convert a char type letter to a string type.
  }
  chars.push_back("#END#");
  const int len = static_cast<int>(chars.size());
  int num_vectors = 0;
  std::vector<float> vector_value_sum(dim_, 0);
  // The length of meaningful characters in the pre-training file is 2, 3, 4.
  const int slice_len[] = {2, 3, 4};
  for (const int slice : slice_len) {
    const int end = len - slice + 1;
    for (int pos = 0; pos < end; pos++) {
      // Join `slice` consecutive characters into one n-gram string.
      std::string gram;
      for (int k = 0; k < slice; k++) {
        gram += chars[pos + k];
      }
      std::string gram_key = std::to_string(slice) + "gram-" + gram;
      auto str_index = map_.find(gram_key);
      // Only n-grams present in the pre-trained map contribute to the average.
      // Bug fix: the previous code detected "found" by comparing the looked-up
      // vector against `init_vec`, which wrongly excluded stored vectors that
      // happen to equal the OOV vector (e.g. all zeros, or equal to unk_init).
      if (str_index != map_.end()) {
        (void)std::transform(str_index->second.begin(), str_index->second.end(), vector_value_sum.begin(),
                             vector_value_sum.begin(), std::plus<float>());
        num_vectors++;
      }
    }
  }
  if (num_vectors > 0) {
    // Average the contributions of all matched n-grams.
    std::vector<float> vector_value(dim_, 0);
    (void)std::transform(vector_value_sum.begin(), vector_value_sum.end(), vector_value.begin(),
                         [num_vectors](float value) -> float { return value / num_vectors; });
    return vector_value;
  }
  return init_vec;  // No n-gram matched: the token is out-of-vocabulary.
}
} // namespace dataset
} // namespace mindspore

+ 64
- 0
mindspore/ccsrc/minddata/dataset/text/char_n_gram.h View File

@@ -0,0 +1,64 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_CHAR_N_GRAM_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_CHAR_N_GRAM_H_
#include <algorithm>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "minddata/dataset/text/vectors.h"
namespace mindspore {
namespace dataset {
/// \brief Build CharNGram vectors from reading a Pre-train word vectors.
class CharNGram : public Vectors {
 public:
  /// Constructor.
  CharNGram() = default;

  /// Constructor.
  /// \param[in] map A map between string and vector.
  /// \param[in] dim Dimension of the vectors.
  CharNGram(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim);

  /// Destructor.
  ~CharNGram() = default;

  /// \brief Build CharNGram from reading a CharNGram pre-train vector file.
  /// \param[out] char_n_gram CharNGram object which contains the pre-train vectors.
  /// \param[in] path Path to the CharNGram pre-trained word vector file.
  /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
  static Status BuildFromFile(std::shared_ptr<CharNGram> *char_n_gram, const std::string &path,
                              int32_t max_vectors = 0);

  /// \brief Look up embedding vectors of token.
  /// \note The result is the average of the pre-trained vectors of the token's character n-grams.
  /// \param[in] token A token to be looked up.
  /// \param[in] unk_init In case of the token is out-of-vectors (OOV), the result will be initialized with `unk_init`.
  ///     (default={}, means to initialize with zero vectors).
  /// \param[in] lower_case_backup Whether to look up the token in the lower case (Default = false).
  /// \return The vector of the input token.
  std::vector<float> Lookup(const std::string &token, const std::vector<float> &unk_init = {},
                            bool lower_case_backup = false);
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_CHAR_N_GRAM_H_

+ 1
- 1
mindspore/ccsrc/minddata/dataset/text/fast_text.cc View File

@@ -20,7 +20,7 @@
namespace mindspore {
namespace dataset {
FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {}
FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) : Vectors(map, dim) {}
Status CheckFastText(const std::string &file_path) {
Path path = Path(file_path);


+ 1
- 1
mindspore/ccsrc/minddata/dataset/text/fast_text.h View File

@@ -39,7 +39,7 @@ class FastText : public Vectors {
/// Constructor.
/// \param[in] map A map between string and vector.
/// \param[in] dim Dimension of the vectors.
FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
FastText(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim);
/// Destructor.
~FastText() = default;


+ 1
- 1
mindspore/ccsrc/minddata/dataset/text/glove.cc View File

@@ -20,7 +20,7 @@
namespace mindspore {
namespace dataset {
GloVe::GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {}
GloVe::GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) : Vectors(map, dim) {}
Status CheckGloVe(const std::string &file_path) {
Path path = Path(file_path);


+ 1
- 1
mindspore/ccsrc/minddata/dataset/text/glove.h View File

@@ -39,7 +39,7 @@ class GloVe : public Vectors {
/// Constructor.
/// \param[in] map A map between string and vector.
/// \param[in] dim Dimension of the vectors.
GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
GloVe(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim);
/// Destructor.
~GloVe() = default;


+ 2
- 2
mindspore/ccsrc/minddata/dataset/text/vectors.cc View File

@@ -60,7 +60,7 @@ Status Vectors::InferShape(const std::string &path, int32_t max_vectors, int32_t
}
Status Vectors::Load(const std::string &path, int32_t max_vectors,
std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim) {
std::unordered_map<std::string, std::vector<float>> *map, int32_t *vector_dim) {
RETURN_UNEXPECTED_IF_NULL(map);
RETURN_UNEXPECTED_IF_NULL(vector_dim);
auto realpath = FileUtils::GetRealPath(common::SafeCStr(path));
@@ -107,7 +107,7 @@ Status Vectors::Load(const std::string &path, int32_t max_vectors,
return Status::OK();
}
Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim) {
Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim) {
map_ = std::move(map);
dim_ = dim;
}


+ 4
- 4
mindspore/ccsrc/minddata/dataset/text/vectors.h View File

@@ -40,7 +40,7 @@ class Vectors {
/// Constructor.
/// \param[in] map A map between string and vector.
/// \param[in] dim Dimension of the vectors.
Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim);
/// Destructor.
virtual ~Vectors() = default;
@@ -61,7 +61,7 @@ class Vectors {
bool lower_case_backup = false);
/// \brief Getter of dimension.
const int &Dim() const { return dim_; }
const int32_t &Dim() const { return dim_; }
protected:
/// \brief Infer the shape of the pre-trained word vector file.
@@ -79,9 +79,9 @@ class Vectors {
/// \param[out] map The map between words and vectors.
/// \param[out] vector_dim The dimension of the vectors in the file.
static Status Load(const std::string &path, int32_t max_vectors,
std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim);
std::unordered_map<std::string, std::vector<float>> *map, int32_t *vector_dim);
int dim_;
int32_t dim_;
std::unordered_map<std::string, std::vector<float>> map_;
};
} // namespace dataset


+ 2
- 2
mindspore/dataset/text/__init__.py View File

@@ -28,14 +28,14 @@ import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText, GloVe
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText, GloVe, CharNGram

__all__ = [
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors", "FastText",
"GloVe"
"GloVe", "CharNGram"
]

if platform.system().lower() != 'windows':


+ 27
- 1
mindspore/dataset/text/utils.py View File

@@ -27,7 +27,7 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
check_from_file_vectors

__all__ = [
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText", "GloVe"
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText", "GloVe", "CharNGram"
]


@@ -465,3 +465,29 @@ class GloVe(cde.GloVe):

max_vectors = max_vectors if max_vectors is not None else 0
return super().from_file(file_path, max_vectors)


class CharNGram(cde.CharNGram):
    """
    CharNGram object that is used to map tokens into pre-trained vectors.

    A token is represented by the average of the pre-trained vectors of its
    character n-grams, so tokens absent from the file can still be mapped.
    """

    @classmethod
    @check_from_file_vectors
    def from_file(cls, file_path, max_vectors=None):
        """
        Build a CharNGram vector from a file.

        Args:
            file_path (str): Path of the file that contains the CharNGram vectors.
            max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
                Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
                situations where the entire set doesn't fit in memory, or is not needed for another reason,
                passing max_vectors can limit the size of the loaded set (default=None, no limit).

        Returns:
            CharNGram, the CharNGram object built from the file.

        Examples:
            >>> char_n_gram = text.CharNGram.from_file("/path/to/char_n_gram/file", max_vectors=None)
        """

        # None means "no limit"; the C++ layer encodes that as 0.
        max_vectors = max_vectors if max_vectors is not None else 0
        return super().from_file(file_path, max_vectors)

+ 1
- 0
tests/ut/cpp/dataset/CMakeLists.txt View File

@@ -53,6 +53,7 @@ SET(DE_UT_SRCS
c_api_repeat_test.cc
c_api_samplers_test.cc
c_api_text_sentence_piece_vocab_test.cc
c_api_text_test.cc
c_api_text_vocab_test.cc
c_api_text_test.cc
c_api_transforms_test.cc


+ 346
- 22
tests/ut/cpp/dataset/c_api_text_test.cc View File

@@ -23,6 +23,7 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/char_n_gram.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"
#include "minddata/dataset/text/vectors.h"
@@ -30,6 +31,7 @@

using namespace mindspore::dataset;
using mindspore::Status;
using mindspore::dataset::CharNGram;
using mindspore::dataset::FastText;
using mindspore::dataset::GloVe;
using mindspore::dataset::ShuffleMode;
@@ -1658,7 +1660,8 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
EXPECT_NE(ds, nullptr);

// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
std::shared_ptr<TensorTransform> to_number =
std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
EXPECT_NE(to_number, nullptr);

// Create a Map operation on ds
@@ -3740,7 +3743,7 @@ TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
/// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
// Test with all parameters.
// Test with all parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
@@ -3801,7 +3804,7 @@ TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
/// Description: test with pre-vectors set that have the different dimension
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
// Tokens don't have the same number of vectors.
// Tokens don't have the same number of vectors.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";

// Create a TextFile dataset
@@ -3819,7 +3822,7 @@ TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
/// Description: test with pre-vectors set that has the head-info
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
// Test with words that has head info.
// Test with words that has head info.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
@@ -3880,7 +3883,7 @@ TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
/// Description: test with the parameter max_vectors that is <= 0
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
// Test with max_vectors <= 0.
// Test with max_vectors <= 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";

// Create a TextFile dataset
@@ -3898,7 +3901,7 @@ TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
/// Description: test with the pre-vectors file that is empty
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
// Read empty file.
// Read empty file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";

// Create a TextFile dataset
@@ -3916,7 +3919,7 @@ TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
/// Description: test with the pre-vectors file that is not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
// Test with not exist file.
// Test with not exist file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";

// Create a TextFile dataset
@@ -3934,7 +3937,7 @@ TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
/// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
// wrong info.
// Wrong info.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";

// Create a TextFile dataset
@@ -4137,7 +4140,7 @@ TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
/// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
// Test with all parameters.
// Test with all parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
@@ -4198,7 +4201,7 @@ TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
/// Description: test with pre-vectors set that have the different dimension
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
// Tokens don't have the same number of vectors.
// Tokens don't have the same number of vectors.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";

// Create a TextFile dataset
@@ -4216,7 +4219,7 @@ TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
/// Description: test with the parameter max_vectors that is <= 0
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
// Test with max_vectors <= 0.
// Test with max_vectors <= 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";

// Create a TextFile dataset
@@ -4234,7 +4237,7 @@ TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
/// Description: test with the pre-vectors file that is empty
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
// Read empty file.
// Read empty file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";

// Create a TextFile dataset
@@ -4252,7 +4255,7 @@ TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
/// Description: test with the pre-vectors file that is not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
// Test with not exist file.
// Test with not exist file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";

// Create a TextFile dataset
@@ -4270,7 +4273,7 @@ TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
/// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
// wrong info.
// Wrong info.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";

// Create a TextFile dataset
@@ -4288,7 +4291,7 @@ TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
/// Description: test with the pre-vectors set that has a wrong suffix
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
// wrong info.
// Wrong info.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";

// Create a TextFile dataset
@@ -4497,7 +4500,7 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
/// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
// Test with all parameters.
// Test with all parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
@@ -4560,7 +4563,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
/// Description: test with pre-vectors set that have the different dimension
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) {
// Tokens don't have the same number of glove.
// Tokens don't have the same number of glove.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDifferentDimension.";

// Create a TextFile dataset
@@ -4578,7 +4581,7 @@ TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) {
/// Description: test with the parameter max_vectors that is <= 0
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) {
// Test with max_vectors <= 0.
// Test with max_vectors <= 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeMaxVectorsLessThanZero.";

// Create a TextFile dataset
@@ -4596,7 +4599,7 @@ TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) {
/// Description: test with the pre-vectors file that is empty
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) {
// Read empty file.
// Read empty file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithEmptyFile.";

// Create a TextFile dataset
@@ -4614,7 +4617,7 @@ TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) {
/// Description: test with the pre-vectors file that is not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) {
// Test with not exist file.
// Test with not exist file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithNotExistFile.";

// Create a TextFile dataset
@@ -4632,7 +4635,7 @@ TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) {
/// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) {
// wrong info.
// Wrong info.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongInfoFile.";

// Create a TextFile dataset
@@ -4650,7 +4653,7 @@ TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) {
/// Description: test with the pre-vectors set that has a wrong format
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) {
// wrong info.
// Wrong info.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongFormat.";

// Create a TextFile dataset
@@ -4663,3 +4666,324 @@ TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) {
Status s = GloVe::BuildFromFile(&glove, vectors_dir);
EXPECT_NE(s, Status::OK());
}

/// Feature: CharNGram
/// Description: test with default parameter in function BuildFromFile and function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
  // Test with default parameter.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDefaultParam.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build CharNGram from the pre-trained file with default max_vectors (no limit)
  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  EXPECT_EQ(s, Status::OK());
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
                                              {0, 0, 0, 0, 0},
                                              {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
                                              {0, 0, 0, 0, 0},
                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
                                              {0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // Compare shape AND content; the original check only compared the shapes,
    // leaving the expected values unused.
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}

/// Feature: CharNGram
/// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllBuildfromfileParams.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build CharNGram limited to the first 18 pre-trained vectors
  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
                                              {0, 0, 0, 0, 0},
                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
                                              {0, 0, 0, 0, 0},
                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
                                              {0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // Compare shape AND content; the original check only compared the shapes,
    // leaving the expected values unused.
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}

/// Feature: CharNGram
/// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramUnknownInit.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build CharNGram limited to the first 18 pre-trained vectors
  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  EXPECT_EQ(s, Status::OK());

  // OOV tokens should be initialized with -1 instead of zeros
  std::vector<float> unknown_init(5, -1);
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
                                              {-1, -1, -1, -1, -1},
                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
                                              {-1, -1, -1, -1, -1},
                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
                                              {-1, -1, -1, -1, -1}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // Compare shape AND content; the original check only compared the shapes,
    // leaving the expected values unused.
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}

/// Feature: CharNGram
/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
/// `unknown_init` and `lower_case_backup` in function Lookup. Some tokens contain uppercase letters
/// Expectation: return correct MSTensor which is equal to the expected one
TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
// Test with all parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllParams.";
// Create a TextFile dataset whose tokens include uppercase letters.
std::string data_file = datasets_root_path_ + "/testVectors/words_with_big_letter.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
std::shared_ptr<CharNGram> char_n_gram;
Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
EXPECT_EQ(s, Status::OK());

// lower_case_backup=true: capitalized tokens are retried in lower case before
// falling back to the -1 unknown_init vector.
std::vector<float> unknown_init(5, -1);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true);
EXPECT_NE(lookup, nullptr);

// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));

uint64_t i = 0;
// Expected lookup per input row; rows of -1 correspond to OOV tokens.
std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
{-1,-1,-1,-1,-1},
{0.117336,0.362446,-0.983326,0.939264,-0.05648},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{-1,-1,-1,-1,-1},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{-1,-1,-1,-1,-1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
std::vector<int64_t> ind_shape = ind.Shape();
std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
// NOTE(review): only tensor shapes are compared; the expected element values
// built above are never asserted against `ind` — consider adding a value check.
EXPECT_EQ(ind_shape, ms_expected_shape);

ASSERT_OK(iter->GetNextRow(&row));
i++;
}

// All 7 input rows must have been consumed.
EXPECT_EQ(i, 7);

// Manually terminate the pipeline
iter->Stop();
}

/// Feature: CharNGram
/// Description: test with a pre-trained vectors file whose rows have different dimensions
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCharNGramDifferentDimension) {
// BuildFromFile must reject a pre-trained file whose rows do not all share
// the same vector dimension.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDifferentDimension.";

// A TextFile dataset is created to mirror the positive-path tests.
std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> text_ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
EXPECT_NE(text_ds, nullptr);

std::shared_ptr<CharNGram> char_n_gram;
std::string bad_table_path = datasets_root_path_ + "/testVectors/char_n_gram_20_dim_different.txt";
Status build_status = CharNGram::BuildFromFile(&char_n_gram, bad_table_path);
EXPECT_NE(build_status, Status::OK());
}

/// Feature: CharNGram
/// Description: test with the parameter max_vectors that is <= 0
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCharNGramMaxVectorsLessThanZero) {
// Test with max_vectors <= 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramMaxVectorsLessThanZero.";

// A TextFile dataset is created to mirror the positive-path tests.
std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> text_ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
EXPECT_NE(text_ds, nullptr);

// A negative max_vectors must make BuildFromFile fail.
std::shared_ptr<CharNGram> char_n_gram;
std::string table_path = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
Status build_status = CharNGram::BuildFromFile(&char_n_gram, table_path, -1);
EXPECT_NE(build_status, Status::OK());
}

/// Feature: CharNGram
/// Description: test with the pre-vectors file that is empty
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCharNGramWithEmptyFile) {
// Read empty file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramWithEmptyFile.";

// A TextFile dataset is created to mirror the positive-path tests.
std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> text_ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
EXPECT_NE(text_ds, nullptr);

// An empty pre-trained vectors file must make BuildFromFile fail.
std::shared_ptr<CharNGram> char_n_gram;
std::string empty_table_path = datasets_root_path_ + "/testVectors/vectors_empty.txt";
Status build_status = CharNGram::BuildFromFile(&char_n_gram, empty_table_path);
EXPECT_NE(build_status, Status::OK());
}

/// Feature: CharNGram
/// Description: test with a pre-trained vectors file that does not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCharNGramsWithNotExistFile) {
// Test with not exist file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramsWithNotExistFile.";

// A TextFile dataset is created to mirror the positive-path tests.
std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> text_ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
EXPECT_NE(text_ds, nullptr);

// A missing pre-trained vectors file must make BuildFromFile fail.
std::shared_ptr<CharNGram> char_n_gram;
std::string missing_table_path = datasets_root_path_ + "/testVectors/no_vectors.txt";
Status build_status = CharNGram::BuildFromFile(&char_n_gram, missing_table_path);
EXPECT_NE(build_status, Status::OK());
}

+ 138
- 0
tests/ut/cpp/dataset/execute_test.cc View File

@@ -23,6 +23,7 @@
#include "minddata/dataset/include/dataset/vision.h"
#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/char_n_gram.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"
#include "minddata/dataset/text/vectors.h"
@@ -30,6 +31,7 @@

using namespace mindspore::dataset;
using mindspore::LogStream;
using mindspore::dataset::CharNGram;
using mindspore::dataset::FastText;
using mindspore::dataset::GloVe;
using mindspore::dataset::Vectors;
@@ -1937,6 +1939,142 @@ TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForGloVe) {
EXPECT_FALSE(status02.IsOk());
}

/// Feature: CharNGram
/// Description: test basic usage of CharNGram and the ToVectors with default parameter
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestCharNGramParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestCharNGramParam.";
// Scalar string token looked up in every case below.
std::shared_ptr<Tensor> de_tensor;
Tensor::CreateScalar<std::string>("the", &de_tensor);
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
mindspore::MSTensor lookup_result;

// Create expected output.
// expected01: lookup of "the" against the full table; expected02: the same
// lookup when only the first 18 vectors are loaded (max_vectors=18).
std::shared_ptr<Tensor> de_expected01;
std::vector<float> expected01 = {-0.840079,-0.0270003,-0.833472,0.588367,-0.210012};
ASSERT_OK(Tensor::CreateFromVector(expected01, &de_expected01));
auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
std::shared_ptr<Tensor> de_expected02;
std::vector<float> expected02 = {-1.34122,0.0442693,-0.48697,0.662939,-0.367669};
ASSERT_OK(Tensor::CreateFromVector(expected02, &de_expected02));
auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));

// Transform params.
// Case 1: default BuildFromFile (all vectors loaded).
std::string vectors_dir = "data/dataset/testVectors/char_n_gram_20.txt";
std::shared_ptr<CharNGram> char_n_gram01;
Status s01 = CharNGram::BuildFromFile(&char_n_gram01, vectors_dir);
EXPECT_EQ(s01, Status::OK());
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(char_n_gram01);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token, &lookup_result);
// NOTE(review): only shapes are compared here and below; the expected values
// themselves are never checked against lookup_result.
EXPECT_EQ(lookup_result.Shape(), ms_expected01.Shape());
EXPECT_TRUE(status01.IsOk());

// Case 2: max_vectors=100 exceeds the table size (20), so behaves like case 1.
std::shared_ptr<CharNGram> char_n_gram02;
Status s02 = CharNGram::BuildFromFile(&char_n_gram02, vectors_dir, 100);
EXPECT_EQ(s02, Status::OK());
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(char_n_gram02);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token, &lookup_result);
EXPECT_EQ(lookup_result.Shape(), ms_expected01.Shape());
EXPECT_TRUE(status02.IsOk());

// Case 3: max_vectors=18 truncates the table, changing the lookup result.
std::shared_ptr<CharNGram> char_n_gram03;
Status s03 = CharNGram::BuildFromFile(&char_n_gram03, vectors_dir, 18);
EXPECT_EQ(s03, Status::OK());
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(char_n_gram03);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token, &lookup_result);
EXPECT_EQ(lookup_result.Shape(), ms_expected02.Shape());
EXPECT_TRUE(status03.IsOk());
}

/// Feature: CharNGram
/// Description: test basic usage of ToVectors and the CharNGram with default parameter
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestToVectorsParamForCharNGram) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParamForCharNGram.";
// Three scalar tokens: an OOV word, an in-vocabulary word, and its
// capitalized variant (for the lower_case_backup case).
std::shared_ptr<Tensor> de_tensor01;
Tensor::CreateScalar<std::string>("none", &de_tensor01);
auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
std::shared_ptr<Tensor> de_tensor02;
Tensor::CreateScalar<std::string>("the", &de_tensor02);
auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
std::shared_ptr<Tensor> de_tensor03;
Tensor::CreateScalar<std::string>("The", &de_tensor03);
auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
mindspore::MSTensor lookup_result;

// Create expected output.
// expected01: default OOV fill (zeros); expected02: custom unk_init (-1);
// expected03: lookup of "the" against the full table.
std::shared_ptr<Tensor> de_expected01;
std::vector<float> expected01(5, 0);
ASSERT_OK(Tensor::CreateFromVector(expected01, &de_expected01));
auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
std::shared_ptr<Tensor> de_expected02;
std::vector<float> expected02(5, -1);
ASSERT_OK(Tensor::CreateFromVector(expected02, &de_expected02));
auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
std::shared_ptr<Tensor> de_expected03;
std::vector<float> expected03 = {-0.840079,-0.0270003,-0.833472,0.588367,-0.210012};
ASSERT_OK(Tensor::CreateFromVector(expected03, &de_expected03));
auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));

// Transform params.
std::string vectors_dir = "data/dataset/testVectors/char_n_gram_20.txt";
std::shared_ptr<CharNGram> char_n_gram;
Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
EXPECT_EQ(s, Status::OK());

// Case 1: OOV token with default ToVectors -> zero vector.
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(char_n_gram);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token01, &lookup_result);
// NOTE(review): only shapes are compared here and below; the expected values
// themselves are never checked against lookup_result.
EXPECT_EQ(lookup_result.Shape(), ms_expected01.Shape());
EXPECT_TRUE(status01.IsOk());
// Case 2: OOV token with a custom unk_init -> -1 vector.
std::vector<float> unknown_init(5, -1);
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(char_n_gram, unknown_init);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token01, &lookup_result);
EXPECT_EQ(lookup_result.Shape(), ms_expected02.Shape());
EXPECT_TRUE(status02.IsOk());
// Case 3: in-vocabulary token -> its table vector.
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(char_n_gram, unknown_init);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token02, &lookup_result);
EXPECT_EQ(lookup_result.Shape(), ms_expected03.Shape());
EXPECT_TRUE(status03.IsOk());
// Case 4: capitalized token with lower_case_backup=true -> same as case 3.
std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true);
auto transform04 = Execute({to_vectors04});
Status status04 = transform04(token03, &lookup_result);
EXPECT_EQ(lookup_result.Shape(), ms_expected03.Shape());
EXPECT_TRUE(status04.IsOk());
}

/// Feature: CharNGram
/// Description: test invalid parameter of ToVectors
/// Expectation: throw exception correctly
TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForCharNGram) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParamForCharNGram.";
// One scalar token reused by both negative cases.
std::shared_ptr<Tensor> raw_token;
Tensor::CreateScalar<std::string>("none", &raw_token);
auto probe_token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(raw_token));
mindspore::MSTensor lookup_result;

// Transform params.
std::string vectors_dir = "data/dataset/testVectors/char_n_gram_20.txt";
std::shared_ptr<CharNGram> valid_char_n_gram;
Status build_status = CharNGram::BuildFromFile(&valid_char_n_gram, vectors_dir);
EXPECT_EQ(build_status, Status::OK());

// Case 1: unk_init length (4) does not match the table's vector dimension (5).
std::vector<float> short_unknown_init(4, -1);
std::shared_ptr<TensorTransform> bad_init_op =
std::make_shared<text::ToVectors>(valid_char_n_gram, short_unknown_init);
auto bad_init_exec = Execute({bad_init_op});
Status bad_init_status = bad_init_exec(probe_token, &lookup_result);
EXPECT_FALSE(bad_init_status.IsOk());

// Case 2: a null CharNGram handle must be rejected.
std::shared_ptr<CharNGram> null_char_n_gram = nullptr;
std::shared_ptr<TensorTransform> null_gram_op = std::make_shared<text::ToVectors>(null_char_n_gram);
auto null_gram_exec = Execute({null_gram_op});
Status null_gram_status = null_gram_exec(probe_token, &lookup_result);
EXPECT_FALSE(null_gram_status.IsOk());
}

// Feature: DBToAmplitude
// Description: test DBToAmplitude in eager mode
// Expectation: the data is processed successfully


+ 20
- 0
tests/ut/data/dataset/testVectors/char_n_gram_20.txt View File

@@ -0,0 +1,20 @@
1gram-e -0.655379 0.574261 -0.714026 -0.148858 -0.0534275
1gram-a -0.288984 -0.225616 0.323913 -0.261039 -0.0628034
1gram-t 0.408448 0.175862 -0.296873 -0.209094 -0.53478
1gram-i 0.278486 -0.910641 -0.743681 -0.734405 0.519959
1gram-n -0.0712582 0.0898121 -1.12567 -0.815067 -0.435836
1gram-o -0.182786 0.535789 -0.391385 0.181972 0.317399
1gram-r 0.68474 0.103464 0.201631 -0.65319 0.554142
1gram-s -0.175988 -0.813322 0.465603 -0.0951031 0.193374
1gram-h -0.39348 -0.678079 0.233101 0.431805 2.04905
1gram-l -0.451299 -0.268223 -0.787034 -0.991984 0.251244
1gram-d 0.799629 -0.326191 -0.474959 0.235657 0.796227
2gram-e#END# -2.26956 0.288491 -0.740001 0.661703 0.147355
1gram-c -0.0413309 0.436135 -0.835305 -1.64429 -1.08329
2gram-s#END# 0.657201 2.11761 -1.59276 0.432072 1.21395
1gram-u -0.25203 -0.176365 -0.263038 -0.995372 -1.24916
2gram-#BEGIN#t -0.96853 -0.789463 0.515762 2.02107 -1.64635
1gram-m 0.422293 -0.149725 -0.734202 1.27342 0.232722
2gram-he -0.785562 0.63378 -1.23667 -0.693956 0.395988
2gram-th 0.663336 -0.240809 -1.87298 0.364651 0.26296
2gram-n#END# -0.149612 -0.664577 -1.12344 2.23695 0.610406

+ 20
- 0
tests/ut/data/dataset/testVectors/char_n_gram_20_dim_different.txt View File

@@ -0,0 +1,20 @@
1gram-e -0.655379 0.574261 -0.714026 -0.148858 -0.0534275
1gram-a -0.288984 -0.225616 0.323913 -0.261039 -0.0628034
1gram-t 0.408448 0.175862 -0.296873 -0.209094 -0.53478
1gram-i 0.278486 -0.910641 -0.743681 -0.734405 0.519959
1gram-n -0.0712582 0.0898121 -1.12567 -0.815067 -0.435836
1gram-o -0.182786 0.535789 -0.391385 0.181972 0.317399
1gram-r 0.68474 0.103464 0.201631 -0.65319 0.554142
1gram-s -0.175988 -0.813322 0.465603 -0.0951031 0.193374
1gram-h -0.39348 -0.678079 0.233101 0.431805 2.04905
1gram-l -0.451299 -0.268223 -0.787034 -0.991984 0.251244
1gram-d 0.799629 -0.326191 -0.474959 0.235657 0.796227
2gram-e#END# -2.26956 0.288491 -0.740001 0.661703 0.147355
1gram-c -0.0413309 0.436135 -0.835305 -1.64429 -1.08329
2gram-s#END# 0.657201 2.11761 -1.59276 0.432072 1.21395
1gram-u -0.25203 -0.176365 -0.263038 -0.995372 -1.24916
2gram-#BEGIN#t -0.96853 -0.789463 0.515762 2.02107
1gram-m 0.422293 -0.149725 -0.734202 1.27342 0.232722
2gram-he -0.785562 0.63378 -1.23667 -0.693956 0.395988
2gram-th 0.663336 -0.240809 -1.87298 0.364651 0.26296
2gram-n#END# -0.149612 -0.664577 -1.12344 2.23695 0.610406

+ 7
- 7
tests/ut/data/dataset/testVectors/words_with_big_letter.txt View File

@@ -1,7 +1,7 @@
ok
!
This
iS
my
HOME
.
ok
!
This
iS
my
HOME
.

+ 217
- 0
tests/ut/python/dataset/test_char_n_gram.py View File

@@ -0,0 +1,217 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest
from mindspore import log
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.text.transforms as T
DATASET_ROOT_PATH = "../data/dataset/testVectors/"
def _count_unequal_element(data_expected, data_me, rtol, atol):
assert data_expected.shape == data_me.shape
total_count = len(data_expected.flatten())
error = np.abs(data_expected - data_me)
greater = np.greater(error, atol + np.abs(data_expected)*rtol)
loss_count = np.count_nonzero(greater)
assert (loss_count/total_count) < rtol,\
"\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".\
format(data_expected[greater], data_me[greater], error[greater])
def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True):
    """Check two arrays for closeness; on a NaN-free mismatch, delegate to
    _count_unequal_element for a detailed per-element failure report."""
    contains_nan = np.any(np.isnan(data_expected))
    is_close = np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan)
    if contains_nan:
        # NaN-bearing data must match exactly under equal_nan semantics.
        assert is_close
    elif not is_close:
        _count_unequal_element(data_expected, data_me, rtol, atol)
def test_char_n_gram_all_to_vectors_params_eager():
    """
    Feature: CharNGram
    Description: test with all parameters which include `unk_init`
    and `lower_case_backup` in function ToVectors in eager mode
    Expectation: output is equal to the expected value
    """
    # max_vectors=18 truncates the 20-line table, which changes the lookup
    # results versus the full file.
    char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=18)
    unk_init = (-np.ones(5)).tolist()
    to_vectors = T.ToVectors(char_n_gram, unk_init=unk_init, lower_case_backup=True)
    # "THE"/"To" resolve via lower_case_backup; "." is OOV and yields unk_init.
    result1 = to_vectors("THE")
    result2 = to_vectors(".")
    result3 = to_vectors("To")
    res = [[-1.34121733e+00, 4.42693333e-02, -4.86969667e-01, 6.62939000e-01, -3.67669000e-01],
           [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, -1.00000000e+00, -1.00000000e+00],
           [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]]
    res_array = np.array(res, dtype=np.float32)
    allclose_nparray(res_array[0], result1, 0.0001, 0.0001)
    allclose_nparray(res_array[1], result2, 0.0001, 0.0001)
    allclose_nparray(res_array[2], result3, 0.0001, 0.0001)
def test_char_n_gram_build_from_file():
    """
    Feature: CharNGram
    Description: test with only default parameter
    Expectation: output is equal to the expected value
    """
    char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt")
    to_vectors = text.ToVectors(char_n_gram)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    ind = 0
    # All-zero rows correspond to OOV tokens (default unknown fill).
    res = [[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0.117336, 0.362446, -0.983326, 0.939264, -0.05648],
           [0.657201, 2.11761, -1.59276, 0.432072, 1.21395],
           [0., 0., 0., 0., 0.],
           [-2.26956, 0.288491, -0.740001, 0.661703, 0.147355],
           [0., 0., 0., 0., 0.]]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        allclose_nparray(res_array, d["text"], 0.0001, 0.0001)
        ind += 1
def test_char_n_gram_all_build_from_file_params():
    """
    Feature: CharNGram
    Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
    Expectation: output is equal to the expected value
    """
    # max_vectors=100 exceeds the 20-line table, so all vectors are loaded and
    # the expected values match the default-parameter test.
    char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=100)
    to_vectors = text.ToVectors(char_n_gram)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    ind = 0
    # All-zero rows correspond to OOV tokens (default unknown fill).
    res = [[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0.117336, 0.362446, -0.983326, 0.939264, -0.05648],
           [0.657201, 2.11761, -1.59276, 0.432072, 1.21395],
           [0., 0., 0., 0., 0.],
           [-2.26956, 0.288491, -0.740001, 0.661703, 0.147355],
           [0., 0., 0., 0., 0.]]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        allclose_nparray(res_array, d["text"], 0.0001, 0.0001)
        ind += 1
def test_char_n_gram_all_build_from_file_params_eager():
    """
    Feature: CharNGram
    Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile in eager mode
    Expectation: output is equal to the expected value
    """
    # max_vectors=18 truncates the table; "the" therefore averages over fewer
    # matching n-gram rows than in the default-parameter eager test.
    char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt", max_vectors=18)
    to_vectors = T.ToVectors(char_n_gram)
    result1 = to_vectors("the")
    result2 = to_vectors(".")
    result3 = to_vectors("to")
    res = [[-1.34121733e+00, 4.42693333e-02, -4.86969667e-01, 6.62939000e-01, -3.67669000e-01],
           [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
           [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]]
    res_array = np.array(res, dtype=np.float32)
    allclose_nparray(res_array[0], result1, 0.0001, 0.0001)
    allclose_nparray(res_array[1], result2, 0.0001, 0.0001)
    allclose_nparray(res_array[2], result3, 0.0001, 0.0001)
def test_char_n_gram_build_from_file_eager():
    """
    Feature: CharNGram
    Description: test with only default parameter in eager mode
    Expectation: output is equal to the expected value
    """
    char_n_gram = text.CharNGram.from_file(DATASET_ROOT_PATH + "char_n_gram_20.txt")
    to_vectors = T.ToVectors(char_n_gram)
    # "." is OOV and maps to the default zero vector.
    result1 = to_vectors("the")
    result2 = to_vectors(".")
    result3 = to_vectors("to")
    res = [[-8.40079000e-01, -2.70002500e-02, -8.33472250e-01, 5.88367000e-01, -2.10011750e-01],
           [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
           [-9.68530000e-01, -7.89463000e-01, 5.15762000e-01, 2.02107000e+00, -1.64635000e+00]]
    res_array = np.array(res, dtype=np.float32)
    allclose_nparray(res_array[0], result1, 0.0001, 0.0001)
    allclose_nparray(res_array[1], result2, 0.0001, 0.0001)
    allclose_nparray(res_array[2], result3, 0.0001, 0.0001)
def test_char_n_gram_invalid_input():
    """
    Feature: CharNGram
    Description: test the validate function with invalid parameters.
    Expectation: Verification of correct error message for invalid input.
    """
    def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None,
                           unk_init=None, lower_case_backup=False, token="ok"):
        # Build the CharNGram/ToVectors chain and expect `error` to be raised
        # somewhere along it, carrying `error_msg`.
        log.info("Test CharNGram with wrong input: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            char_n_gram = text.CharNGram.from_file(file_path, max_vectors=max_vectors)
            to_vectors = T.ToVectors(char_n_gram, unk_init=unk_init, lower_case_backup=lower_case_backup)
            to_vectors(token)
        assert error_msg in str(error_info.value)

    test_invalid_input("Not all vectors have the same number of dimensions",
                       DATASET_ROOT_PATH + "char_n_gram_20_dim_different.txt", error=RuntimeError,
                       error_msg="all vectors must have the same number of dimensions, " +
                       "but got dim 4 while expecting 5")
    test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "vectors_empty.txt",
                       error=RuntimeError, error_msg="invalid file, file is empty.")
    test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
                       DATASET_ROOT_PATH + "char_n_gram_20.txt",
                       error=RuntimeError, error_msg="unk_init must be the same length as vectors, " +
                       "but got unk_init: 6 and vectors: 5", unk_init=np.ones(6).tolist())
    test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.txt", RuntimeError,
                       error_msg="get real path failed")
    test_invalid_input("max_vectors parameter must be greater than 0",
                       DATASET_ROOT_PATH + "char_n_gram_20.txt", error=ValueError,
                       error_msg="Input max_vectors is not within the required interval", max_vectors=-1)
    test_invalid_input("invalid max_vectors parameter type as a float",
                       DATASET_ROOT_PATH + "char_n_gram_20.txt", error=TypeError,
                       error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
                       " but got <class 'float'>.", max_vectors=1.0)
    test_invalid_input("invalid max_vectors parameter type as a string",
                       DATASET_ROOT_PATH + "char_n_gram_20.txt", error=TypeError,
                       error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
                       " but got <class 'str'>.", max_vectors="1")
    test_invalid_input("invalid token parameter type as a float",
                       DATASET_ROOT_PATH + "char_n_gram_20.txt", error=RuntimeError,
                       error_msg="input tensor type should be string.", token=1.0)
    # Fix: the original issued this identical case twice back-to-back; the
    # duplicate added no coverage and has been removed.
    test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "char_n_gram_20.txt",
                       error=TypeError, error_msg="Argument lower_case_backup with " +
                       "value True is not of type [<class 'bool'>],"
                       " but got <class 'str'>.", lower_case_backup="True")
if __name__ == '__main__':
    # Allow running this suite directly, without pytest collection.
    test_char_n_gram_all_to_vectors_params_eager()
    test_char_n_gram_build_from_file()
    test_char_n_gram_all_build_from_file_params()
    test_char_n_gram_all_build_from_file_params_eager()
    test_char_n_gram_build_from_file_eager()
    test_char_n_gram_invalid_input()

+ 0
- 2
tests/ut/python/dataset/test_fast_text.py View File

@@ -42,7 +42,6 @@ def test_fast_text_all_build_from_file_params():
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
print(data)
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
@@ -135,7 +134,6 @@ def test_fast_text_build_from_file():
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
print(data)
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind


+ 0
- 2
tests/ut/python/dataset/test_glove.py View File

@@ -42,7 +42,6 @@ def test_glove_all_build_from_file_params():
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
print(data)
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
@@ -135,7 +134,6 @@ def test_glove_build_from_file():
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
print(data)
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind


+ 0
- 1
tests/ut/python/dataset/test_vectors.py View File

@@ -101,7 +101,6 @@ def test_vectors_from_file_all_buildfromfile_params():
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
print(data)
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind


Loading…
Cancel
Save