In [1]:
# Suppose we have the following DataSet; only two samples are used to keep the example small.
import sys
import os

# Location added to sys.path so that fastNLP can be imported. Set the
# FASTNLP_PATH environment variable to override it instead of editing this
# cell; the original hard-coded location is kept as the fallback.
FASTNLP_PATH = os.environ.get('FASTNLP_PATH', '/Users/yh/Desktop/fastNLP/fastNLP')
if FASTNLP_PATH not in sys.path:  # re-running the cell must not append duplicates
    sys.path.append(FASTNLP_PATH)

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary

# Assemble the two-sample DataSet.
dataset = DataSet()
for sample in (
    Instance(raw_sent='This is a bad idea .', label=0),
    Instance(raw_sent='It is great .', label=1),
):
    dataset.append(sample)

# Preprocess the data following the steps of fastNLP_10min_tutorial.ipynb.
# Some field names are changed here in order to demonstrate padding.
dataset.apply(lambda ins: ins['raw_sent'].lower(), new_field_name='raw_sent')
dataset.apply(lambda ins: ins['raw_sent'].split(), new_field_name='word_str_lst')

# Build the Vocabulary from the tokenised sentences, then index every token.
word_vocab = Vocabulary()
dataset.apply(lambda ins: word_vocab.update(ins['word_str_lst']))
dataset.apply(
    lambda ins: list(map(word_vocab.to_index, ins['word_str_lst'])),
    new_field_name='words',
)

# Check that we got the result we wanted.
dataset[:2]



DataSet({'raw_sent': this is a bad idea . type=str,
'label': 0 type=int,
'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,
'words': [4, 2, 5, 6, 7, 3] type=list},
{'raw_sent': it is great . type=str,
'label': 1 type=int,
'word_str_lst': ['it', 'is', 'great', '.'] type=list,
'words': [8, 2, 9, 3] type=list})

In [2]:
# Mark fields as input or target
dataset.set_input('word_str_lst')
dataset.set_input('words')
dataset.set_target('label')

# Use Batch to draw batches of data
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
    print("batch_x has: ", batch_x)
    print("batch_y has: ", batch_y)
# The string below is the note displayed as the cell result. In English:
#   Batch pads, by default, fields whose innermost element type is int or
#   float (raw_sent: str, word_str_lst: str, words: int, label: int); fields
#   with any other element type are not padded. A field that is
#   two-dimensional within each Instance is not padded either, because the
#   padding would involve two dimensions and its form cannot be determined
#   automatically.
# The opening delimiter used to be four quotes, which put a stray '"' at the
# start of the displayed string; it is now a regular triple quote.
"""
结果中
    Batch会对元素类型(元素即最内层的数据，raw_sent为str，word_str_lst为str，words为int, label为int)为int或者float的数据进行默认
        padding，而非int或float的则不进行padding。但若每个Instance中该field为二维数据，也不进行padding。因为二维数据的padding涉及到
        两个维度的padding，不容易自动判断padding的形式。
"""

batch_x has:  {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),
       list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[4, 2, 5, 6, 7, 3],
        [8, 2, 9, 3, 0, 0]])}
batch_y has:  {'label': tensor([0, 1])}


'"\n结果中\n    Batch会对元素类型(元素即最内层的数据，raw_sent为str，word_str_lst为str，words为int, label为int)为int或者float的数据进行默认\n        padding，而非int或float的则不进行padding。但若每个Instance中该field为二维数据，也不进行padding。因为二维数据的padding涉及到\n        两个维度的padding，不容易自动判断padding的形式。\n'

In [3]:
# Every pad_val defaults to 0. To change the default pad value of a single field, use
# DataSet.set_pad_val(field_name, pad_val).
#    Here the padding of the `words` field is changed to -100.
dataset.set_pad_val('words', pad_val=-100)
batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
    print("batch_x has: ", batch_x)
    print("batch_y has: ", batch_y)
# The pad value is now -100

batch_x has:  {'word_str_lst': array([list(['it', 'is', 'great', '.']),
       list(['this', 'is', 'a', 'bad', 'idea', '.'])], dtype=object), 'words': tensor([[   8,    2,    9,    3, -100, -100],
        [   4,    2,    5,    6,    7,    3]])}
batch_y has:  {'label': tensor([1, 0])}


In [4]:
# For two-dimensional padding, or any custom padding scheme, set a padder on the field. English
#    character padding serves as the example below: in some scenarios the characters of each English
#    word are used as features, which requires two-dimensional padding, while fastNLP only performs
#    one-dimensional padding by default.

# Split every word into its list of characters.
dataset.apply(
    lambda ins: [list(word) for word in ins['word_str_lst']],
    new_field_name='char_str_lst',
)
# Collect the character vocabulary word by word, then map each character to its index.
char_vocab = Vocabulary()
dataset.apply(lambda ins: [char_vocab.update(word_chars) for word_chars in ins['char_str_lst']])
dataset.apply(
    lambda ins: [list(map(char_vocab.to_index, word_chars)) for word_chars in ins['char_str_lst']],
    new_field_name='chars',
)
dataset[:2]

DataSet({'raw_sent': this is a bad idea . type=str,
'label': 0 type=int,
'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,
'words': [4, 2, 5, 6, 7, 3] type=list,
'char_str_lst': [['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['b', 'a', 'd'], ['i', 'd', 'e', 'a'], ['.']] type=list,
'chars': [[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]] type=list},
{'raw_sent': it is great . type=str,
'label': 1 type=int,
'word_str_lst': ['it', 'is', 'great', '.'] type=list,
'words': [8, 2, 9, 3] type=list,
'char_str_lst': [['i', 't'], ['i', 's'], ['g', 'r', 'e', 'a', 't'], ['.']] type=list,
'chars': [[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]] type=list})

In [5]:
# What happens when no padding method is specified for the two-dimensional character field
dataset.set_input('chars')
batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
    print("batch_x has: ", batch_x)
    print("batch_y has: ", batch_y)
    
# The string below is displayed as the cell result. In English: the other fields are the same as
# before; `chars` needs padding along two dimensions, so the padding method cannot be decided
# automatically and the field is output in its original form.
"""
    其它field与之前的是相同的。chars因为存在两个维度需要padding，不能自动决定padding方式，所以直接输出了原始形式。
"""

batch_x has:  {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),
       list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[   4,    2,    5,    6,    7,    3],
        [   8,    2,    9,    3, -100, -100]]), 'chars': array([list([[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]]),
       list([[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]])], dtype=object)}
batch_y has:  {'label': tensor([0, 1])}


'\n    其它field与之前的是相同的。chars因为存在两个维度需要padding，不能自动决定padding方式，所以直接输出了原始形式。\n'

In [6]:
# To use two-dimensional padding, the padding method has to be set manually
from fastNLP.core.fieldarray import EngChar2DPadder
dataset.set_padder('chars', EngChar2DPadder())
batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
    print("batch_x has: ", batch_x)
    print("batch_y has: ", batch_y)
    
# The string below is displayed as the cell result. In English: `chars` is now padded correctly.
"""
    chars被正确padding了
"""

batch_x has:  {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),
       list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[   4,    2,    5,    6,    7,    3],
        [   8,    2,    9,    3, -100, -100]]), 'chars': tensor([[[ 4,  9,  2,  5],
         [ 2,  5,  0,  0],
         [ 3,  0,  0,  0],
         [10,  3,  6,  0],
         [ 2,  6,  7,  3],
         [ 8,  0,  0,  0]],

        [[ 2,  4,  0,  0],
         [ 2,  5,  0,  0],
         [11, 12,  7,  3],
         [ 8,  0,  0,  0],
         [ 0,  0,  0,  0],
         [ 0,  0,  0,  0]]])}
batch_y has:  {'label': tensor([0, 1])}


'\n    chars被正确padding了\n'

In [7]:
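# If AutoPad and EngChar2DPadder do not meet your needs, you can implement your own Padder object. As an example, here the raw_sent field is padded so that all strings have the same length.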
from fastNLP.core.fieldarray import PadderBase

class PadStr(PadderBase):
    """Padder that right-pads every string of a batch to the length of the longest one.

    The fill value is ``pad_val`` (a single space by default).
    """

    def __init__(self, pad_val=' '):
        # Let the parent class manage pad_val so that DataSet.set_pad_val() can change it.
        super().__init__(pad_val=pad_val)

    def __call__(self, contents, field_name, field_ele_dtype):
        """Pad all strings in ``contents`` to a common length.

        Taking the example above, when the raw_sent field is padded the arguments are:

        contents:
            [
                'this is a bad idea .',
                'it is great .'
            ]
        field_name: 'raw_sent', the name of the current field; mainly useful for debugging.
        field_ele_dtype: np.str. The type of the innermost elements of the field; this
            argument is rarely needed.

        Returns a new list in which each string has ``pad_val`` appended until it is as
        long as the longest string in ``contents``. An empty ``contents`` yields ``[]``.
        """
        # default=0 keeps max() from raising ValueError when the batch is empty.
        max_len = max((len(str_) for str_ in contents), default=0)
        return [content + (max_len - len(content)) * self.pad_val for content in contents]

# Mark raw_sent as an input field and attach the custom PadStr padder to it.
dataset.set_input('raw_sent')
dataset.set_padder('raw_sent', PadStr())
batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
    print("batch_x has: ", batch_x)
    print("batch_y has: ", batch_y)

# The string below is displayed as the cell result. In English: raw_sent is output correctly
# and its contents have been padded as well.
"""
    raw_sent正确输出，对应内容也进行了pad。
"""

batch_x has:  {'raw_sent': ['this is a bad idea .', 'it is great .       '], 'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),
       list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[   4,    2,    5,    6,    7,    3],
        [   8,    2,    9,    3, -100, -100]]), 'chars': tensor([[[ 4,  9,  2,  5],
         [ 2,  5,  0,  0],
         [ 3,  0,  0,  0],
         [10,  3,  6,  0],
         [ 2,  6,  7,  3],
         [ 8,  0,  0,  0]],

        [[ 2,  4,  0,  0],
         [ 2,  5,  0,  0],
         [11, 12,  7,  3],
         [ 8,  0,  0,  0],
         [ 0,  0,  0,  0],
         [ 0,  0,  0,  0]]])}
batch_y has:  {'label': tensor([0, 1])}


'\n    raw_sent正确输出，对应内容也进行了pad。\n'