Huawei_Technology
/
mindspore

 
			
							# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Basic helper functions and classes for text processing.
"""
from enum import IntEnum

import copy
import numpy as np
import mindspore._c_dataengine as cde

from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset


class Vocab(cde.Vocab):
    """
    Vocab object used for word lookup, built on top of cde.Vocab.

    Every factory method is wrapped by the corresponding check_* decorator imported from .validators
    (presumably argument validation; see that module for the exact checks).
    """

    @classmethod
    @check_from_dataset
    def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None):
        """
        Build a vocab from the words found in a dataset.

        All unique words in the dataset are collected and those whose frequency lies inside freq_range are kept.
        The user is warned if no word falls into the requested frequency range. Words are ordered from the highest
        frequency to the lowest; words sharing the same frequency are ordered lexicographically.

        Args:
            dataset(Dataset): dataset to build the vocab from.
            columns([str, list], optional): name, or list of names, of the columns to read words from
                (default=None, all columns are used; an error is returned if any column is not of string type).
            freq_range(tuple, optional): tuple of integers (min_frequency, max_frequency); only words within this
                range are kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as
                min_frequency=1, and max_frequency > total_words is the same as max_frequency = total_words.
                Either bound may be None, meaning 0 and total_words respectively
                (default=None, all words are included).
            top_k(int, optional): top_k > 0. Number of words to put into the vocab; the top_k most frequent words
                are taken, after freq_range has been applied. If fewer than top_k words are available, all of them
                are taken (default=None, all words are included).

        Returns:
            text.Vocab: Vocab object built from dataset.

        Raises:
            ValueError: If iterating the build_vocab pipeline yields any item that is not None.
        """
        result = Vocab()
        # build_vocab is attached to a deep copy, not to the dataset object passed in by the caller.
        pipeline = copy.deepcopy(dataset)
        build_node = pipeline.build_vocab(result, columns, freq_range, top_k)
        # Drain the pipeline; every item it yields is expected to be None.
        # NOTE(review): the message text reads oddly against the `is not None` condition - confirm the intent.
        rows = build_node.create_dict_iterator()
        if any(row is not None for row in rows):
            raise ValueError("from_dataset should receive data other than None.")
        return result

    @classmethod
    @check_from_list
    def from_list(cls, word_list):
        """
        Build a vocab object from a list of words.

        Args:
            word_list(list): list of strings, each element being one word.

        Returns:
            Whatever cde.Vocab.from_list returns for word_list.
        """
        vocab = super().from_list(word_list)
        return vocab

    @classmethod
    @check_from_file
    def from_file(cls, file_path, delimiter=None, vocab_size=None):
        """
        Build a vocab object from a vocab file.

        Args:
            file_path(str): path to the file which contains the vocab list.
            delimiter(str, optional): delimiter used to split each line of the file; the first element is taken
                to be the word (default=None).
            vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken).

        Returns:
            Whatever cde.Vocab.from_file returns for the given arguments.
        """
        vocab = super().from_file(file_path, delimiter, vocab_size)
        return vocab

    @classmethod
    @check_from_dict
    def from_dict(cls, word_dict):
        """
        Build a vocab object from a dict.

        Args:
            word_dict(dict): dict of (word, id) pairs. Ids should start from 2 and be continuous.

        Returns:
            Whatever cde.Vocab.from_dict returns for word_dict.
        """
        vocab = super().from_dict(word_dict)
        return vocab


def to_str(array, encoding='utf8'):
    """
    Decode each element of a numpy array of `bytes` into `str`, using the charset `encoding`.

    Args:
        array (numpy array): Array whose elements are `bytes` representing strings.
        encoding (string): Name of the charset used for decoding (default='utf8').

    Returns:
        Numpy array of `str`.

    Raises:
        ValueError: If `array` is not a numpy.ndarray.
    """
    if isinstance(array, np.ndarray):
        # Element-wise bytes.decode over the whole array.
        return np.char.decode(array, encoding)

    raise ValueError('input should be a numpy array.')


def to_bytes(array, encoding='utf8'):
    """
    Encode each element of a numpy array of `str` into `bytes`, using the charset `encoding`.

    Args:
        array (numpy array): Array whose elements are `str` representing strings.
        encoding (string): Name of the charset used for encoding (default='utf8').

    Returns:
        Numpy array of `bytes`.

    Raises:
        ValueError: If `array` is not a numpy.ndarray.
    """
    if isinstance(array, np.ndarray):
        # Element-wise str.encode over the whole array.
        return np.char.encode(array, encoding)

    raise ValueError('input should be a numpy array.')


class JiebaMode(IntEnum):
    """
    Integer-valued enumeration of Jieba modes.

    NOTE(review): judging by the member names, these presumably select the Jieba word-segmentation algorithm
    (mixed / max-probability / hidden Markov model) - confirm against the tokenizer that consumes them.
    """
    MIX = 0
    MP = 1
    HMM = 2


class NormalizeForm(IntEnum):
    """
    Integer-valued enumeration of normalization forms.

    NOTE(review): the member names match the Unicode normalization forms (NFC, NFKC, NFD, NFKD), with NONE
    presumably meaning that no normalization is applied - confirm against the operator that consumes them.
    """
    NONE = 0
    NFC = 1
    NFKC = 2
    NFD = 3
    NFKD = 4