|
|
|
@@ -51,7 +51,8 @@ from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPi |
|
|
|
from .validators import check_lookup, check_jieba_add_dict, \ |
|
|
|
check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \ |
|
|
|
check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \ |
|
|
|
check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow |
|
|
|
check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \ |
|
|
|
check_sentence_piece_tokenizer |
|
|
|
from ..core.datatypes import mstype_to_detype |
|
|
|
from ..core.validator_helpers import replace_none |
|
|
|
from ..transforms.c_transforms import TensorOperation |
|
|
|
@@ -325,7 +326,7 @@ class SentencePieceTokenizer(TextTensorOperation): |
|
|
|
Args: |
|
|
|
mode (Union[str, SentencePieceVocab]): If the input parameter is a file, it is of type str. |
|
|
|
If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab. |
|
|
|
out_type (Union[str, int]): The type of output. |
|
|
|
out_type (SPieceTokenizerOutType): The type of the output, either int or string. |
|
|
|
|
|
|
|
Examples: |
|
|
|
>>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType |
|
|
|
@@ -335,7 +336,7 @@ class SentencePieceTokenizer(TextTensorOperation): |
|
|
|
>>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) |
|
|
|
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer) |
|
|
|
""" |
|
|
|
|
|
|
|
@check_sentence_piece_tokenizer |
|
|
|
def __init__(self, mode, out_type): |
|
|
|
self.mode = mode |
|
|
|
self.out_type = out_type |
|
|
|
|