|
|
@@ -12,7 +12,9 @@ |
|
|
# See the License for the specific language governing permissions and |
|
|
# See the License for the specific language governing permissions and |
|
|
# limitations under the License. |
|
|
# limitations under the License. |
|
|
""" |
|
|
""" |
|
|
Some basic function for text |
|
|
|
|
|
|
|
|
The module text.utils provides some general methods for nlp text processing. |
|
|
|
|
|
For example, you can use Vocab to build a dictionary, |
|
|
|
|
|
use to_bytes and to_str to encode and decode strings into a specified format. |
|
|
""" |
|
|
""" |
|
|
from enum import IntEnum |
|
|
from enum import IntEnum |
|
|
|
|
|
|
|
|
@@ -52,12 +54,12 @@ class Vocab(cde.Vocab): |
|
|
min_frequency/max_frequency can be None, which corresponds to 0/total_words respectively |
|
|
min_frequency/max_frequency can be None, which corresponds to 0/total_words respectively |
|
|
(default=None, all words are included). |
|
|
(default=None, all words are included). |
|
|
top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are |
|
|
top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are |
|
|
taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None, |
|
|
|
|
|
|
|
|
taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None, |
|
|
all words are included). |
|
|
all words are included). |
|
|
special_tokens(list, optional): a list of strings, each one is a special token. For example |
|
|
special_tokens(list, optional): a list of strings, each one is a special token. For example |
|
|
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added). |
|
|
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added). |
|
|
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens |
|
|
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens |
|
|
is specified and special_first is set to None, special_tokens will be prepended. (default=None). |
|
|
|
|
|
|
|
|
is specified and special_first is set to None, special_tokens will be prepended (default=None). |
|
|
|
|
|
|
|
|
Returns: |
|
|
Returns: |
|
|
Vocab, Vocab object built from dataset. |
|
|
Vocab, Vocab object built from dataset. |
|
|
@@ -81,7 +83,7 @@ class Vocab(cde.Vocab): |
|
|
special_tokens(list, optional): a list of strings, each one is a special token. For example |
|
|
special_tokens(list, optional): a list of strings, each one is a special token. For example |
|
|
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added). |
|
|
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added). |
|
|
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens |
|
|
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens |
|
|
is specified and special_first is set to None, special_tokens will be prepended. (default=None). |
|
|
|
|
|
|
|
|
is specified and special_first is set to None, special_tokens will be prepended (default=None). |
|
|
""" |
|
|
""" |
|
|
|
|
|
|
|
|
return super().from_list(word_list, special_tokens, special_first) |
|
|
return super().from_list(word_list, special_tokens, special_first) |
|
|
@@ -101,7 +103,7 @@ class Vocab(cde.Vocab): |
|
|
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added). |
|
|
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added). |
|
|
special_first (bool, optional): whether special_tokens will be prepended/appended to vocab. |
|
|
special_first (bool, optional): whether special_tokens will be prepended/appended to vocab. |
|
|
If special_tokens is specified and special_first is set to None, |
|
|
If special_tokens is specified and special_first is set to None, |
|
|
special_tokens will be prepended. (default=None). |
|
|
|
|
|
|
|
|
special_tokens will be prepended (default=None). |
|
|
""" |
|
|
""" |
|
|
|
|
|
|
|
|
return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first) |
|
|
return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first) |
|
|
@@ -157,12 +159,14 @@ def to_bytes(array, encoding='utf8'): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class JiebaMode(IntEnum): |
|
|
class JiebaMode(IntEnum): |
|
|
|
|
|
"""An enumeration for JiebaTokenizer, effective enumeration types are MIX, MP, HMM.""" |
|
|
MIX = 0 |
|
|
MIX = 0 |
|
|
MP = 1 |
|
|
MP = 1 |
|
|
HMM = 2 |
|
|
HMM = 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NormalizeForm(IntEnum): |
|
|
class NormalizeForm(IntEnum): |
|
|
|
|
|
"""An enumeration for NormalizeUTF8, effective enumeration types are NONE, NFC, NFKC, NFD, NFKD.""" |
|
|
NONE = 0 |
|
|
NONE = 0 |
|
|
NFC = 1 |
|
|
NFC = 1 |
|
|
NFKC = 2 |
|
|
NFKC = 2 |
|
|
|