|
|
|
@@ -417,6 +417,9 @@ if platform.system().lower() != 'windows': |
|
|
|
""" |
|
|
|
Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'. |
|
|
|
|
|
|
|
Note: |
|
|
|
WhitespaceTokenizer is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Args: |
|
|
|
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). |
|
|
|
|
|
|
|
@@ -445,6 +448,9 @@ if platform.system().lower() != 'windows': |
|
|
|
""" |
|
|
|
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. |
|
|
|
|
|
|
|
Note: |
|
|
|
UnicodeScriptTokenizer is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Args: |
|
|
|
keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False). |
|
|
|
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). |
|
|
|
@@ -475,6 +481,9 @@ if platform.system().lower() != 'windows': |
|
|
|
""" |
|
|
|
Apply case fold operation on UTF-8 string tensor. |
|
|
|
|
|
|
|
Note: |
|
|
|
CaseFold is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Examples: |
|
|
|
>>> import mindspore.dataset.text as text |
|
|
|
>>> |
|
|
|
@@ -495,6 +504,9 @@ if platform.system().lower() != 'windows': |
|
|
|
""" |
|
|
|
Apply normalize operation on UTF-8 string tensor. |
|
|
|
|
|
|
|
Note: |
|
|
|
NormalizeUTF8 is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Args: |
|
|
|
normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE, |
|
|
|
NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, |
|
|
|
@@ -528,6 +540,9 @@ if platform.system().lower() != 'windows': |
|
|
|
|
|
|
|
See http://userguide.icu-project.org/strings/regexp for supported regex patterns. |
|
|
|
|
|
|
|
Note: |
|
|
|
RegexReplace is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Args: |
|
|
|
pattern (str): The regular expression pattern to match. |
|
|
|
replace (str): The string used to replace each matched element. |
|
|
|
@@ -556,6 +571,9 @@ if platform.system().lower() != 'windows': |
|
|
|
|
|
|
|
See http://userguide.icu-project.org/strings/regexp for supported regex patterns. |
|
|
|
|
|
|
|
Note: |
|
|
|
RegexTokenizer is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Args: |
|
|
|
delim_pattern (str): The pattern of regex delimiters. |
|
|
|
The original string will be split by matched elements. |
|
|
|
@@ -591,6 +609,9 @@ if platform.system().lower() != 'windows': |
|
|
|
""" |
|
|
|
Tokenize a scalar tensor of UTF-8 string by specific rules. |
|
|
|
|
|
|
|
Note: |
|
|
|
BasicTokenizer is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Args: |
|
|
|
lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation |
|
|
|
on input text to fold the text to lower case and strip accent characters. If False, only apply |
|
|
|
@@ -644,6 +665,9 @@ if platform.system().lower() != 'windows': |
|
|
|
""" |
|
|
|
Tokenizer used for BERT text processing. |
|
|
|
|
|
|
|
Note: |
|
|
|
BertTokenizer is not supported on the Windows platform yet. |
|
|
|
|
|
|
|
Args: |
|
|
|
vocab (Vocab): A vocabulary object. |
|
|
|
suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##'). |
|
|
|
|