hummingbird
/
fastNLP

r"""undocumented"""

__all__ = [
    "Conll2003NERPipe",
    "Conll2003Pipe",
    "OntoNotesNERPipe",
    "MsraNERPipe",
    "PeopleDailyPipe",
    "WeiboNERPipe"
]

from .pipe import Pipe
from .utils import _add_chars_field
from .utils import _indexize, _add_words_field
from .utils import iob2, iob2bioes
from fastNLP.io.data_bundle import DataBundle
from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader
from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader, ConllLoader
# from ...core.const import Const
from ...core.vocabulary import Vocabulary


class _NERPipe(Pipe):
    r"""
    NER任务的处理Pipe, 该Pipe会（1）复制raw_words列，并命名为words; (2）在words, target列建立词表
    (创建 :class:`fastNLP.Vocabulary` 对象，所以在返回的DataBundle中将有两个Vocabulary); (3）将words，target列根据相应的
    Vocabulary转换为index。

    raw_words列为List[str], 是未转换的原始数据; words列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target, seq_len。
    """
    
    def __init__(self, encoding_type: str = 'bio', lower: bool = False):
        r"""

        :param: str encoding_type: target列使用什么类型的encoding方式，支持bioes, bio两种。
        :param bool lower: 是否将words小写化后再建立词表，绝大多数情况都不需要设置为True。
        """
        if encoding_type == 'bio':
            self.convert_tag = iob2
        elif encoding_type == 'bioes':
            self.convert_tag = lambda words: iob2bioes(iob2(words))
        else:
            raise ValueError("encoding_type only supports `bio` and `bioes`.")
        self.lower = lower
    
    def process(self, data_bundle: DataBundle) -> DataBundle:
        r"""
        支持的DataSet的field为

        .. csv-table::
           :header: "raw_words", "target"

           "[Nadim, Ladki]", "[B-PER, I-PER]"
           "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
           "[...]", "[...]"

        :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field，且两个field的内容均为List[str]在传入DataBundle基础上原位修改。
        :return DataBundle:
        """
        # 转换tag
        for name, dataset in data_bundle.iter_datasets():
            dataset.apply_field(self.convert_tag, field_name='target', new_field_name='target')
        
        _add_words_field(data_bundle, lower=self.lower)
        
        # index
        _indexize(data_bundle)
        
        input_fields = ['target', 'words', 'seq_len']
        target_fields = ['target', 'seq_len']
        
        for name, dataset in data_bundle.iter_datasets():
            dataset.add_seq_len('words')

        return data_bundle


class Conll2003NERPipe(_NERPipe):
    r"""
    Conll2003的NER任务的处理Pipe, 该Pipe会（1）复制raw_words列，并命名为words; (2）在words, target列建立词表
    (创建 :class:`fastNLP.Vocabulary` 对象，所以在返回的DataBundle中将有两个Vocabulary); (3）将words，target列根据相应的
    Vocabulary转换为index。
    经过该Pipe过后，DataSet中的内容如下所示

    .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
       :header: "raw_words", "target", "words", "seq_len"

       "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2
       "[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6
       "[...]", "[...]", "[...]", .

    raw_words列为List[str], 是未转换的原始数据; words列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+-----------+--------+-------+---------+
        | field_names | raw_words | target | words | seq_len |
        +-------------+-----------+--------+-------+---------+
        |   is_input  |   False   |  True  |  True |   True  |
        |  is_target  |   False   |  True  | False |   True  |
        | ignore_type |           | False  | False |  False  |
        |  pad_value  |           |   0    |   0   |    0    |
        +-------------+-----------+--------+-------+---------+

    """
    
    def process_from_file(self, paths) -> DataBundle:
        r"""

        :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。
        :return: DataBundle
        """
        # 读取数据
        data_bundle = Conll2003NERLoader().load(paths)
        data_bundle = self.process(data_bundle)
        
        return data_bundle


class Conll2003Pipe(Pipe):
    r"""
    经过该Pipe后，DataSet中的内容如下

    .. csv-table::
       :header: "raw_words" , "pos", "chunk", "ner", "words", "seq_len"

       "[Nadim, Ladki]", "[0, 0]", "[1, 2]", "[1, 2]", "[2, 3]", 2
       "[AL-AIN, United, Arab, ...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", "[4, 5, 6,...]", 6
       "[...]", "[...]", "[...]", "[...]", "[...]", .

    其中words, seq_len是input; pos, chunk, ner, seq_len是target
    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+-----------+-------+-------+-------+-------+---------+
        | field_names | raw_words |  pos  | chunk |  ner  | words | seq_len |
        +-------------+-----------+-------+-------+-------+-------+---------+
        |   is_input  |   False   | False | False | False |  True |   True  |
        |  is_target  |   False   |  True |  True |  True | False |   True  |
        | ignore_type |           | False | False | False | False |  False  |
        |  pad_value  |           |   0   |   0   |   0   |   0   |    0    |
        +-------------+-----------+-------+-------+-------+-------+---------+


    """
    def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False):
        r"""

        :param str chunk_encoding_type: 支持bioes, bio。
        :param str ner_encoding_type: 支持bioes, bio。
        :param bool lower: 是否将words列小写化后再建立词表
        """
        if chunk_encoding_type == 'bio':
            self.chunk_convert_tag = iob2
        elif chunk_encoding_type == 'bioes':
            self.chunk_convert_tag = lambda tags: iob2bioes(iob2(tags))
        else:
            raise ValueError("chunk_encoding_type only supports `bio` and `bioes`.")
        if ner_encoding_type == 'bio':
            self.ner_convert_tag = iob2
        elif ner_encoding_type == 'bioes':
            self.ner_convert_tag = lambda tags: iob2bioes(iob2(tags))
        else:
            raise ValueError("ner_encoding_type only supports `bio` and `bioes`.")
        self.lower = lower
    
    def process(self, data_bundle) -> DataBundle:
        r"""
        输入的DataSet应该类似于如下的形式

        .. csv-table::
           :header: "raw_words", "pos", "chunk", "ner"

           "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]"
           "[AL-AIN, United, Arab, ...]", "[NNP, NNP...]", "[B-NP, B-NP, ...]", "[B-LOC, B-LOC,...]"
           "[...]", "[...]", "[...]", "[...]", .

        :param data_bundle:
        :return: 传入的DataBundle
        """
        # 转换tag
        for name, dataset in data_bundle.datasets.items():
            dataset.drop(lambda x: "-DOCSTART-" in x['raw_words'])
            dataset.apply_field(self.chunk_convert_tag, field_name='chunk', new_field_name='chunk')
            dataset.apply_field(self.ner_convert_tag, field_name='ner', new_field_name='ner')
        
        _add_words_field(data_bundle, lower=self.lower)
        
        # index
        _indexize(data_bundle, input_field_names='words', target_field_names=['pos', 'ner'])
        # chunk中存在一些tag只在dev中出现，没在train中
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(*data_bundle.datasets.values(), field_name='chunk')
        tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='chunk')
        data_bundle.set_vocab(tgt_vocab, 'chunk')
        
        input_fields = ['words', 'seq_len']
        target_fields = ['pos', 'ner', 'chunk', 'seq_len']
        
        for name, dataset in data_bundle.iter_datasets():
            dataset.add_seq_len('words')

        return data_bundle
    
    def process_from_file(self, paths):
        r"""

        :param paths:
        :return:
        """
        data_bundle = ConllLoader(headers=['raw_words', 'pos', 'chunk', 'ner']).load(paths)
        return self.process(data_bundle)


class OntoNotesNERPipe(_NERPipe):
    r"""
    处理OntoNotes的NER数据，处理之后DataSet中的field情况为

    .. csv-table::
       :header: "raw_words", "target", "words", "seq_len"

       "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2
       "[AL-AIN, United, Arab, ...]", "[3, 4]", "[4, 5, 6,...]", 6
       "[...]", "[...]", "[...]", .

    raw_words列为List[str], 是未转换的原始数据; words列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+-----------+--------+-------+---------+
        | field_names | raw_words | target | words | seq_len |
        +-------------+-----------+--------+-------+---------+
        |   is_input  |   False   |  True  |  True |   True  |
        |  is_target  |   False   |  True  | False |   True  |
        | ignore_type |           | False  | False |  False  |
        |  pad_value  |           |   0    |   0   |    0    |
        +-------------+-----------+--------+-------+---------+

    """
    
    def process_from_file(self, paths):
        data_bundle = OntoNotesNERLoader().load(paths)
        return self.process(data_bundle)


class _CNNERPipe(Pipe):
    r"""
    中文NER任务的处理Pipe, 该Pipe会（1）复制raw_chars列，并命名为chars; (2）在chars, target列建立词表
    (创建 :class:`fastNLP.Vocabulary` 对象，所以在返回的DataBundle中将有两个Vocabulary); (3）将chars，target列根据相应的
    Vocabulary转换为index。

    raw_chars列为List[str], 是未转换的原始数据; chars列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。

    """
    
    def __init__(self, encoding_type: str = 'bio', bigrams=False, trigrams=False):
        r"""
        
        :param str encoding_type: target列使用什么类型的encoding方式，支持bioes, bio两种。
        :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果
            设置为True，返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input，对应的vocab可以通过
            data_bundle.get_vocab('bigrams')获取.
        :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...]
            。如果设置为True，返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input，对应的vocab可以通过
            data_bundle.get_vocab('trigrams')获取.
        """
        if encoding_type == 'bio':
            self.convert_tag = iob2
        elif encoding_type == 'bioes':
            self.convert_tag = lambda words: iob2bioes(iob2(words))
        else:
            raise ValueError("encoding_type only supports `bio` and `bioes`.")

        self.bigrams = bigrams
        self.trigrams = trigrams

    def process(self, data_bundle: DataBundle) -> DataBundle:
        r"""
        支持的DataSet的field为

        .. csv-table::
           :header: "raw_chars", "target"

           "[相, 比, 之, 下,...]", "[O, O, O, O, ...]"
           "[青, 岛, 海, 牛, 队, 和, ...]", "[B-ORG, I-ORG, I-ORG, ...]"
           "[...]", "[...]"

        raw_chars列为List[str], 是未转换的原始数据; chars列为List[int]，是转换为index的输入数据; target列是List[int]，
        是转换为index的target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。

        :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field，且两个field的内容均为List[str]。在传入DataBundle基础上原位修改。
        :return: DataBundle
        """
        # 转换tag
        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(self.convert_tag, field_name='target', new_field_name='target')
        
        _add_chars_field(data_bundle, lower=False)

        input_field_names = ['chars']
        if self.bigrams:
            for name, dataset in data_bundle.iter_datasets():
                dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                                    field_name='chars', new_field_name='bigrams')
            input_field_names.append('bigrams')
        if self.trigrams:
            for name, dataset in data_bundle.datasets.items():
                dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in
                                                   zip(chars, chars[1:] + ['<eos>'], chars[2:] + ['<eos>'] * 2)],
                                    field_name='chars', new_field_name='trigrams')
            input_field_names.append('trigrams')

        # index
        _indexize(data_bundle, input_field_names, 'target')
        
        input_fields = ['target', 'seq_len'] + input_field_names
        target_fields = ['target', 'seq_len']
        
        for name, dataset in data_bundle.iter_datasets():
            dataset.add_seq_len('chars')

        return data_bundle


class MsraNERPipe(_CNNERPipe):
    r"""
    处理MSRA-NER的数据，处理之后的DataSet的field情况为

    .. csv-table::
       :header: "raw_chars", "target", "chars", "seq_len"

       "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11
       "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21
       "[...]", "[...]", "[...]", .

    raw_chars列为List[str], 是未转换的原始数据; chars列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+-----------+--------+-------+---------+
        | field_names | raw_chars | target | chars | seq_len |
        +-------------+-----------+--------+-------+---------+
        |   is_input  |   False   |  True  |  True |   True  |
        |  is_target  |   False   |  True  | False |   True  |
        | ignore_type |           | False  | False |  False  |
        |  pad_value  |           |   0    |   0   |    0    |
        +-------------+-----------+--------+-------+---------+

    """
    
    def process_from_file(self, paths=None) -> DataBundle:
        data_bundle = MsraNERLoader().load(paths)
        return self.process(data_bundle)


class PeopleDailyPipe(_CNNERPipe):
    r"""
    处理people daily的ner的数据，处理之后的DataSet的field情况为

    .. csv-table::
       :header: "raw_chars", "target", "chars", "seq_len"

       "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11
       "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21
       "[...]", "[...]", "[...]", .

    raw_chars列为List[str], 是未转换的原始数据; chars列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+-----------+--------+-------+---------+
        | field_names | raw_chars | target | chars | seq_len |
        +-------------+-----------+--------+-------+---------+
        |   is_input  |   False   |  True  |  True |   True  |
        |  is_target  |   False   |  True  | False |   True  |
        | ignore_type |           | False  | False |  False  |
        |  pad_value  |           |   0    |   0   |    0    |
        +-------------+-----------+--------+-------+---------+

    """
    
    def process_from_file(self, paths=None) -> DataBundle:
        data_bundle = PeopleDailyNERLoader().load(paths)
        return self.process(data_bundle)


class WeiboNERPipe(_CNNERPipe):
    r"""
    处理weibo的ner的数据，处理之后的DataSet的field情况为

    .. csv-table::
       :header: "raw_chars", "chars", "target", "seq_len"

       "['老', '百', '姓']", "[4, 3, 3]", "[38, 39, 40]", 3
       "['心']", "[0]", "[41]", 1
       "[...]", "[...]", "[...]", .

    raw_chars列为List[str], 是未转换的原始数据; chars列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+-----------+--------+-------+---------+
        | field_names | raw_chars | target | chars | seq_len |
        +-------------+-----------+--------+-------+---------+
        |   is_input  |   False   |  True  |  True |   True  |
        |  is_target  |   False   |  True  | False |   True  |
        | ignore_type |           | False  | False |  False  |
        |  pad_value  |           |   0    |   0   |    0    |
        +-------------+-----------+--------+-------+---------+

    """
    
    def process_from_file(self, paths=None) -> DataBundle:
        data_bundle = WeiboNERLoader().load(paths)
        return self.process(data_bundle)