You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

text.h 4.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  18. #include <memory>
  19. #include <string>
  20. #include <vector>
  21. #include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
  22. #include "minddata/dataset/core/constants.h"
  23. #include "minddata/dataset/include/transforms.h"
  24. #include "minddata/dataset/util/status.h"
  25. #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
  26. #include "minddata/dataset/text/sentence_piece_vocab.h"
  27. #include "minddata/dataset/text/vocab.h"
  28. namespace mindspore {
  29. namespace dataset {
  30. // Transform operations for text
  31. namespace text {
  32. // Forward declarations of the text operation classes defined later in this header (in alphabetical order).
  33. class LookupOperation;
  34. class SentencePieceTokenizerOperation;
  35. /// \brief Lookup operator that looks up a word in a vocabulary and maps it to an id.
  36. /// \param[in] vocab Shared pointer to a Vocab object.
  37. /// \param[in] unknown_token Word to use for lookup if the word being looked up is out of vocabulary (OOV).
  38. ///     If unknown_token itself is OOV, a runtime error will be thrown.
  39. /// \param[in] data_type Type of the tensor after lookup; defaults to DataType("int32").
  40. /// \return Shared pointer to a LookupOperation.
  41. std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  42. const mindspore::dataset::DataType &data_type = DataType("int32"));
  43. /// \brief Tokenize a scalar token or 1-D tokens into tokens by sentencepiece, using a vocab object.
  44. /// \param[in] vocab Shared pointer to a SentencePieceVocab object.
  45. /// \param[in] out_type The type of output, given as an SPieceTokenizerOutType value.
  46. /// \return Shared pointer to a SentencePieceTokenizerOperation.
  47. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  48. const std::shared_ptr<SentencePieceVocab> &vocab, mindspore::dataset::SPieceTokenizerOutType out_type);
  49. /// \brief Tokenize a scalar token or 1-D tokens into tokens by sentencepiece, using a vocab model file.
  50. /// \param[in] vocab_path Path of the vocab model file.
  51. /// \param[in] out_type The type of output, given as an SPieceTokenizerOutType value.
  52. /// \return Shared pointer to a SentencePieceTokenizerOperation.
  53. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  54. const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
  55. /* ####################################### Derived TensorOperation classes ################################# */
  /// \brief TensorOperation subclass for the Lookup operator; it is the type returned by the Lookup() function
  ///     declared in this header.
  /// \note Member function definitions are not part of this header.
  56. class LookupOperation : public TensorOperation {
  57. public:
  /// \brief Constructor.
  /// \param[in] vocab Shared pointer to a Vocab object.
  /// \param[in] unknown_token Word used as the unknown token.
  /// \param[in] data_type DataType for the operation.
  58. explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  59. const DataType &data_type);
  /// \brief Destructor (defaulted).
  60. ~LookupOperation() = default;
  /// \brief Overrides TensorOperation::Build().
  /// \return Shared pointer to a TensorOp.
  61. std::shared_ptr<TensorOp> Build() override;
  /// \brief Overrides TensorOperation::ValidateParams().
  /// \return Status object. NOTE(review): presumably reports whether the stored parameters are valid - confirm
  ///     against the definition.
  62. Status ValidateParams() override;
  63. private:
  64. std::shared_ptr<Vocab> vocab_;  ///< Vocab object; presumably the constructor's vocab argument - TODO confirm.
  65. std::string unknown_token_;  ///< Unknown-token word; presumably the constructor's unknown_token argument.
  // NOTE(review): default_id_ has no matching constructor parameter and no in-class initializer; presumably it is
  // the id that unknown_token_ resolves to - confirm where it is assigned.
  66. int32_t default_id_;
  67. DataType data_type_;  ///< Data type; presumably the constructor's data_type argument - TODO confirm.
  68. };
  /// \brief TensorOperation subclass for the SentencePieceTokenizer operator; it is the type returned by both
  ///     SentencePieceTokenizer() overloads declared in this header.
  /// \note Member function definitions are not part of this header.
  69. class SentencePieceTokenizerOperation : public TensorOperation {
  70. public:
  /// \brief Constructor taking a vocab object.
  /// \param[in] vocab Shared pointer to a SentencePieceVocab object.
  /// \param[in] out_type The type of output.
  71. SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
  /// \brief Constructor taking a vocab path.
  /// \param[in] vocab_path Path string for the vocab.
  /// \param[in] out_type The type of output.
  72. SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);
  /// \brief Destructor (defaulted).
  73. ~SentencePieceTokenizerOperation() = default;
  /// \brief Overrides TensorOperation::Build().
  /// \return Shared pointer to a TensorOp.
  74. std::shared_ptr<TensorOp> Build() override;
  /// \brief Overrides TensorOperation::ValidateParams().
  /// \return Status object. NOTE(review): presumably reports whether the stored parameters are valid - confirm
  ///     against the definition.
  75. Status ValidateParams() override;
  76. private:
  77. std::shared_ptr<SentencePieceVocab> vocab_;  ///< Vocab object; presumably used by the vocab-object constructor.
  78. std::string vocab_path_;  ///< Vocab path; presumably used by the vocab-path constructor.
  // NOTE(review): load_type_ has no matching constructor parameter; presumably it records whether the vocab comes
  // from vocab_ or from vocab_path_ - confirm against the constructor definitions.
  79. SPieceTokenizerLoadType load_type_;
  80. SPieceTokenizerOutType out_type_;  ///< Output type; presumably the constructor's out_type argument.
  81. };
  82. } // namespace text
  83. } // namespace dataset
  84. } // namespace mindspore
  85. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_