/** * Copyright 2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_ #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_ #include #include #include #include #include #include #include #include #include "minddata/dataset/core/tensor.h" #include "minddata/dataset/include/dataset/iterator.h" namespace mindspore { namespace dataset { /// \brief Pre-train word vectors. class Vectors { public: /// Constructor. Vectors() = default; /// Constructor. /// \param[in] map A map between string and vector. /// \param[in] dim Dimension of the vectors. Vectors(const std::unordered_map> &map, int dim); /// Destructor. virtual ~Vectors() = default; /// \brief Build Vectors from reading a pre-train vector file. /// \param[out] vectors Vectors object which contains the pre-train vectors. /// \param[in] path Path to the pre-trained word vector file. /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit). static Status BuildFromFile(std::shared_ptr *vectors, const std::string &path, int32_t max_vectors = 0); /// \brief Look up embedding vectors of token. /// \param[in] token A token to be looked up. /// \param[in] unk_init In case of the token is out-of-vectors (OOV), the result will be initialized with `unk_init`. /// (default={}, means to initialize with zero vectors). /// \param[in] lower_case_backup Whether to look up the token in the lower case (Default = false). /// \return The vector of the input token. virtual std::vector Lookup(const std::string &token, const std::vector &unk_init = {}, bool lower_case_backup = false); /// \brief Getter of dimension. const int &Dim() const { return dim_; } protected: /// \brief Infer the shape of the pre-trained word vector file. /// \param[in] path Path to the pre-trained word vector file. /// \param[in] max_vectors Maximum number of pre-trained word vectors to be read. /// \param[out] num_lines The number of lines of the file. /// \param[out] header_num_lines The number of lines of file header. /// \param[out] vector_dim The dimension of the vectors in the file. static Status InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines, int32_t *vector_dim); /// \brief Load map from reading a pre-train vector file. /// \param[in] path Path to the pre-trained word vector file. /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded, must be non negative. /// \param[out] map The map between words and vectors. /// \param[out] vector_dim The dimension of the vectors in the file. static Status Load(const std::string &path, int32_t max_vectors, std::unordered_map> *map, int *vector_dim); int dim_; std::unordered_map> map_; }; } // namespace dataset } // namespace mindspore #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_