# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
C transforms for all text related operators.
"""
import os
import re
import platform

import mindspore._c_dataengine as cde

from .utils import JiebaMode, NormalizeForm
from .validators import check_lookup, check_jieba_add_dict, \
    check_jieba_add_word, check_jieba_init, check_ngram, check_pair_truncate


class Lookup(cde.LookupOp):
    """
    Lookup operator that looks up a word to an id.

    Args:
        vocab (Vocab): A Vocab object.
        unknown (int, optional): Default id to look up a word that is out of vocab (default=None).
    """

    @check_lookup
    def __init__(self, vocab, unknown=None):
        if unknown is None:
            super().__init__(vocab)
        else:
            super().__init__(vocab, unknown)


class Ngram(cde.NgramOp):
    """
    TensorOp to generate n-grams from a 1-D string Tensor.

    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an explanation of what an n-gram is.

    Args:
        n (int or list of int): n in n-gram, n >= 1. n can be a list of positive integers, e.g. n=[4, 3];
            the result would be a 4-gram followed by a 3-gram in the same tensor.
        left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the left side of the sequence.
            pad_width will be capped at n-1. left_pad=("_", 2) would pad the left side of the sequence with "__"
            (default=None).
        right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the right side of the sequence.
            pad_width will be capped at n-1. right_pad=("-", 2) would pad the right side of the sequence with "--"
            (default=None).
        separator (str, optional): Symbol used to join strings together, e.g. if 2-grams of ["mindspore", "amazing"]
            are generated with separator="-", the result would be ["mindspore-amazing"]
            (default=None, which means whitespace is used).
    """

    @check_ngram
    def __init__(self, n, left_pad=None, right_pad=None, separator=None):
        super().__init__(ngrams=n, l_pad_len=left_pad[1], r_pad_len=right_pad[1], l_pad_token=left_pad[0],
                         r_pad_token=right_pad[0], separator=separator)


DE_C_INTER_JIEBA_MODE = {
    JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX,
    JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP,
    JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
}


class JiebaTokenizer(cde.JiebaTokenizerOp):
    """
    Tokenize a Chinese string into words based on a dictionary.

    Args:
        hmm_path (str): Dictionary file used by the HMMSegment algorithm; the dictionary can be obtained
            from the official website of cppjieba.
        mp_path (str): Dictionary file used by the MPSegment algorithm; the dictionary can be obtained
            from the official website of cppjieba.
        mode (Enum, optional): "MP" mode will tokenize with the MPSegment algorithm, "HMM" mode will tokenize
            with the Hidden Markov Model Segment algorithm, and "MIX" mode will tokenize with a mix of the
            MPSegment and HMMSegment algorithms (default="MIX").
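
    Examples:
        A minimal usage sketch. The dictionary paths below are placeholders for files obtained from cppjieba,
        and `data` is assumed to be an existing dataset with a string column.

        >>> jieba_op = JiebaTokenizer("/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8", mode=JiebaMode.MP)
        >>> data = data.map(operations=jieba_op)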
""" @check_jieba_init def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX): self.mode = mode self.__check_path__(hmm_path) self.__check_path__(mp_path) super().__init__(hmm_path, mp_path, DE_C_INTER_JIEBA_MODE[mode]) @check_jieba_add_word def add_word(self, word, freq=None): """ Add user defined word to JiebaTokenizer's dictionary Args: word(required, string): The word to be added to the JiebaTokenizer instance. The added word will not be written into the built-in dictionary on disk. freq(optional, int): The frequency of the word to be added, The higher the frequency, the better change the word will be tokenized(default None, use default frequency). """ if freq is None: super().add_word(word, 0) else: super().add_word(word, freq) @check_jieba_add_dict def add_dict(self, user_dict): """ Add user defined word to JiebaTokenizer's dictionary Args: user_dict(path/dict):Dictionary to be added, file path or Python dictionary, Python Dict format: {word1:freq1, word2:freq2,...} Jieba dictionary format : word(required), freq(optional), such as: word1 freq1 word2 word3 freq3 """ if isinstance(user_dict, str): self.__add_dict_py_file(user_dict) elif isinstance(user_dict, dict): for k, v in user_dict.items(): self.add_word(k, v) else: raise ValueError("the type of user_dict must str or dict") def __add_dict_py_file(self, file_path): """Add user defined word by file""" words_list = self.__parser_file(file_path) for data in words_list: if data[1] is None: freq = 0 else: freq = int(data[1]) self.add_word(data[0], freq) def __parser_file(self, file_path): """parser user defined word by file""" if not os.path.exists(file_path): raise ValueError( "user dict file {} is not exist".format(file_path)) file_dict = open(file_path) data_re = re.compile('^(.+?)( [0-9]+)?$', re.U) words_list = [] for item in file_dict: data = item.strip() if not isinstance(data, str): data = self.__decode(data) words = data_re.match(data).groups() if len(words) != 2: raise ValueError( "user dict file {} format error".format(file_path)) words_list.append(words) return words_list def __decode(self, data): """decode the dict file to utf8""" try: data = data.decode('utf-8') except UnicodeDecodeError: raise ValueError("user dict file must utf8") return data.lstrip('\ufeff') def __check_path__(self, model_path): """check model path""" if not os.path.exists(model_path): raise ValueError( " jieba mode file {} is not exist".format(model_path)) class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string to Unicode characters. """ class WordpieceTokenizer(cde.WordpieceTokenizerOp): """ Tokenize scalar token or 1-D tokens to subword tokens. Args vocab(Vocab): a Vocab object. suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##'). max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100). unknown_token(string, optional): When we can not found the token: if 'unknown_token' is empty string, return the token directly, else return 'unknown_token'(default '[UNK]'). 
""" def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'): self.vocab = vocab self.suffix_indicator = suffix_indicator self.max_bytes_per_token = max_bytes_per_token self.unknown_token = unknown_token super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token) if platform.system().lower() != 'windows': class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\t', '\r', '\n'). """ class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. Args: keep_whitespace(bool, optional): If or not emit whitespace tokens (default False) """ def __init__(self, keep_whitespace=False): self.keep_whitespace = keep_whitespace super().__init__(self.keep_whitespace) class CaseFold(cde.CaseFoldOp): """ Apply case fold operation on utf-8 string tensor. """ DE_C_INTER_NORMALIZE_FORM = { NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE, NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC, NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC, NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD, NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD } class NormalizeUTF8(cde.NormalizeUTF8Op): """ Apply normalize operation on utf-8 string tensor. Args: normalize_form(Enum, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD". If set "NONE", will do nothing for input string tensor. If set to any of "NFC", "NFKC", "NFD", "NFKD", will apply normalize operation(default "NFKC"). See http://unicode.org/reports/tr15/ for details. """ def __init__(self, normalize_form=NormalizeForm.NFKC): self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] super().__init__(self.normalize_form) class RegexReplace(cde.RegexReplaceOp): """ Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'. See http://userguide.icu-project.org/strings/regexp for support regex pattern. Args: pattern(string): the regex expression patterns. replace(string): the string to replace matched element. replace_all(bool, optional): If False, only replace first matched element; if True, replace all matched elements(default True). """ def __init__(self, pattern, replace, replace_all=True): self.pattern = pattern self.replace = replace self.replace_all = replace_all super().__init__(self.pattern, self.replace, self.replace_all) class RegexTokenizer(cde.RegexTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string by regex expression pattern. See http://userguide.icu-project.org/strings/regexp for support regex pattern. Args: delim_pattern(string): The pattern of regex delimiters. The original string will be split by matched elements. keep_delim_pattern(string, optional): The string matched by 'delim_pattern' can be kept as a token if it can be matched by 'keep_delim_pattern'. And the default value is empty string(''), in this situation, delimiters will not kept as a output token. """ def __init__(self, delim_pattern, keep_delim_pattern=''): self.delim_pattern = delim_pattern self.keep_delim_pattern = keep_delim_pattern super().__init__(self.delim_pattern, self.keep_delim_pattern) class BasicTokenizer(cde.BasicTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string by specific rules. 
        """

        def __init__(self, delim_pattern, keep_delim_pattern=''):
            self.delim_pattern = delim_pattern
            self.keep_delim_pattern = keep_delim_pattern
            super().__init__(self.delim_pattern, self.keep_delim_pattern)


    class BasicTokenizer(cde.BasicTokenizerOp):
        """
        Tokenize a scalar tensor of UTF-8 string by specific rules.

        Args:
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace
                operations on the input text to lower-case it and strip accented characters; if False, only
                apply the NormalizeUTF8 operation with 'normalization_form' mode on the input text (default=False).
            keep_whitespace (bool, optional): If True, whitespace will be kept in the output tokens (default=False).
            normalization_form (Enum, optional): Used to specify a normalize mode, only effective when
                'lower_case' is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]' and '[MASK]' (default=True).
        """

        def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True):
            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form,
                             self.preserve_unused_token)


    class BertTokenizer(cde.BertTokenizerOp):
        """
        Tokenizer used for Bert text processing.

        Args:
            vocab (Vocab): A Vocab object.
            suffix_indicator (str, optional): Used to show that the subword is the last part of a word
                (default='##').
            max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split
                (default=100).
            unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
                return the token directly, else return 'unknown_token' (default='[UNK]').
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace
                operations on the input text to lower-case it and strip accented characters; if False, only
                apply the NormalizeUTF8 operation with 'normalization_form' mode on the input text (default=False).
            keep_whitespace (bool, optional): If True, whitespace will be kept in the output tokens (default=False).
            normalization_form (Enum, optional): Used to specify a normalize mode, only effective when
                'lower_case' is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]' and '[MASK]' (default=True).
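
        Examples:
            A minimal usage sketch. It assumes `vocab` is a Vocab object built beforehand and `data` is an
            existing dataset whose column holds raw text.

            >>> tokenizer_op = BertTokenizer(vocab=vocab, lower_case=True)
            >>> data = data.map(operations=tokenizer_op)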
        """

        def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
                     lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True):
            self.vocab = vocab
            self.suffix_indicator = suffix_indicator
            self.max_bytes_per_token = max_bytes_per_token
            self.unknown_token = unknown_token
            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
                             self.lower_case, self.keep_whitespace, self.normalization_form,
                             self.preserve_unused_token)


class TruncateSequencePair(cde.TruncateSequencePairOp):
    """
    Truncate a pair of rank-1 tensors such that the total length is less than max_length.

    This operation takes two input tensors and returns two output tensors.

    Args:
        max_length (int): Maximum length required.

    Examples:
        >>> # Data before
        >>> # |  col1   |  col2   |
        >>> # +---------+---------+
        >>> # | [1,2,3] |  [4,5]  |
        >>> # +---------+---------+
        >>> data = data.map(operations=TruncateSequencePair(4))
        >>> # Data after
        >>> # |  col1   |  col2   |
        >>> # +---------+---------+
        >>> # |  [1,2]  |  [4,5]  |
        >>> # +---------+---------+
    """

    @check_pair_truncate
    def __init__(self, max_length):
        super().__init__(max_length)