# 使用Trainer和Tester快速训练和测试

## 数据读入和处理

In [1]:
from fastNLP.io import SST2Pipe

# Load and preprocess SST-2 with the pipe; the result bundles the datasets and vocabularies.
pipe = SST2Pipe()
databundle = pipe.process_from_file()
vocab = databundle.get_vocab('words')

# Show, one per line: the bundle summary, the first training instance, and the 'words' vocabulary.
print(databundle, databundle.get_dataset('train')[0], vocab, sep='\n')



In total 3 datasets:
	test has 1821 instances.
	train has 67349 instances.
	dev has 872 instances.
In total 2 vocabs:
	words has 16292 entries.
	target has 2 entries.

+-----------------------------------+--------+-----------------------------------+---------+
| raw_words                         | target | words                             | seq_len |
+-----------------------------------+--------+-----------------------------------+---------+
| hide new secretions from the p... | 1      | [4110, 97, 12009, 39, 2, 6843,... | 7       |
+-----------------------------------+--------+-----------------------------------+---------+
Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)


In [2]:
# Work on the first 5000 training instances only, and split off a 0.015 fraction
# of them as test_data (75 of 5000 in the recorded output).
train_data, test_data = databundle.get_dataset('train')[:5000].split(0.015)
dev_data = databundle.get_dataset('dev')
# Sizes are printed in the order: train, dev, test.
print(*map(len, (train_data, dev_data, test_data)))

4925 872 75


In [3]:
# Print a table with the is_input / is_target / ignore_type / pad_value setting of every field.
train_data.print_field_meta()

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
|   is_input  |   False   | False  |  True |   True  |
|  is_target  |   False   |  True  | False |  False  |
| ignore_type |           | False  | False |  False  |
|  pad_value  |           |   0    |   0   |    0    |
+-------------+-----------+--------+-------+---------+


<prettytable.PrettyTable at 0x7f0db03d0640>

In [4]:
from fastNLP import AccuracyMetric
from fastNLP import Const

# In this example, metrics=AccuracyMetric() is equivalent to the line below.
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

## DataSetIter初探

In [5]:
from fastNLP import BucketSampler
from fastNLP import DataSetIter

# Use the first 10 dev instances as a small demo set.
tmp_data = dev_data[:10]
# Build a batch iterator: pass in the DataSet, the batch_size, and the rule (sampler) used to draw batches.
# Sampler choices: in order (Sequential), shuffled (Random), or similar-length samples grouped into a batch (Bucket).
sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len')
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
# Each iteration yields a pair of dicts, unpacked here as batch_x and batch_y.
for batch_x, batch_y in batch:
    print("batch_x: ",batch_x)
    print("batch_y: ", batch_y)

batch_x:  {'words': tensor([[   13,   830,  7746,   174,     3,    47,     6,    83,  5752,    15,
          2177,    15,    63,    57,   406,    84,  1009,  4973,    27,    17,
         13785,     3,   533,  3687, 15623,    39,   375,     8, 15624,     8,
          1323,  4398,     7],
        [ 1045, 11113,    16,   104,     5,     4,   176,  1824,  1704,     3,
             2,    18,    11,     4,  1018,   432,   143,    33,   245,   308,
             7,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]]), 'seq_len': tensor([33, 21])}
batch_y:  {'target': tensor([1, 0])}
batch_x:  {'words': tensor([[  14,   10,    4,  311,    5,  154, 1418,  609,    7],
        [  14,   10,  437,   32,   78,    3,   78,  437,    7]]), 'seq_len': tensor([9, 9])}
batch_y:  {'target': tensor([0, 1])}
batch_x:  {'words': tensor([[    4,   277,   685,    18,     7],
        [15618,  3204,     5,  1675,     0]]), 'seq_len': tensor([5, 4])}
batch_y:  {'target': te

In [6]:
# Change the padding value of the 'words' field to -1 (this mutates tmp_data in place),
# then iterate again so the new padding value is visible in the printed batches.
tmp_data.set_pad_val('words',-1)
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
    print("batch_x: ",batch_x)
    print("batch_y: ", batch_y)

batch_x:  {'words': tensor([[   13,   830,  7746,   174,     3,    47,     6,    83,  5752,    15,
          2177,    15,    63,    57,   406,    84,  1009,  4973,    27,    17,
         13785,     3,   533,  3687, 15623,    39,   375,     8, 15624,     8,
          1323,  4398,     7],
        [ 1045, 11113,    16,   104,     5,     4,   176,  1824,  1704,     3,
             2,    18,    11,     4,  1018,   432,   143,    33,   245,   308,
             7,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1]]), 'seq_len': tensor([33, 21])}
batch_y:  {'target': tensor([1, 0])}
batch_x:  {'words': tensor([[  14,   10,    4,  311,    5,  154, 1418,  609,    7],
        [  14,   10,  437,   32,   78,    3,   78,  437,    7]]), 'seq_len': tensor([9, 9])}
batch_y:  {'target': tensor([0, 1])}
batch_x:  {'words': tensor([[    2,   155,     3,  4426,     3,   239,     3,   739,     5,  1136,
            41,    43,  2427,   736,     2,   648,    10, 15620

In [7]:
from fastNLP.core.field import Padder
import numpy as np
class FixLengthPadder(Padder):
    """Padder that pads every sample of a field to one fixed length.

    The array produced for a batch always has shape ``(len(contents), length)``.

    :param pad_val: value written into the positions after the end of each sample.
    :param length: fixed width of the padded array; must not be None.
    """
    def __init__(self, pad_val=0, length=None):
        super().__init__(pad_val=pad_val)
        self.length = length
        assert self.length is not None, "Creating FixLengthPadder with no specific length!"

    def __call__(self, contents, field_name, field_ele_dtype, dim):
        """Pad ``contents`` (a batch of sequences) to ``self.length`` columns.

        :param contents: the sequences of the current batch; may be empty.
        :param field_name: accepted for the Padder call signature; not used here.
        :param field_ele_dtype: dtype of the returned numpy array.
        :param dim: accepted for the Padder call signature; not used here.
        :return: numpy array of shape ``(len(contents), self.length)`` filled with
            ``self.pad_val`` wherever a sequence is shorter than ``self.length``.
        :raises AssertionError: if any sequence is longer than ``self.length``.
        """
        # Longest sequence in the current batch; default=0 keeps an empty batch
        # from raising ValueError inside max() and yields an empty (0, length) array.
        max_len = max(map(len, contents), default=0)
        # Fail loudly rather than truncate when a sequence does not fit the fixed length.
        assert max_len <= self.length, "Fixed padder length smaller than actual length! with length {}".format(max_len)
        array = np.full((len(contents), self.length), self.pad_val, dtype=field_ele_dtype)
        for i, content_i in enumerate(contents):
            # Copy the sequence into the left part of the row; the rest keeps pad_val.
            array[i, :len(content_i)] = content_i
        return array

# Create a FixLengthPadder with a fixed length of 40.
tmp_padder = FixLengthPadder(pad_val=0,length=40)
# Use the dataset's set_padder function to install it as the padder of the 'words' field.
tmp_data.set_padder('words',tmp_padder)
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
    print("batch_x: ",batch_x)
    print("batch_y: ", batch_y)

batch_x:  {'words': tensor([[   45,   752,   327,   180,    10, 15621,    16,    72,  8904,     9,
          1217,     7,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  879,    96,     8,  1026,    12,  8067,    11, 13623,     8, 15619,
             4,   673,   662,    15,     4,  1154,   240,   639,   417,     7,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'seq_len': tensor([12, 20])}
batch_y:  {'target': tensor([1, 0])}
batch_x:  {'words': tensor([[   13,   830,  7746,   174,     3,    47,     6,    83,  5752,    15,
          2177,    15,    63,    57,   406,    84,  1009,  4973,    27,    17,
         13785,     3,   533,  3687, 15623,    39,   375,     8, 15624,     8,
          1323,  4398

## 使用DataSetIter自己编写训练过程


In [8]:
from fastNLP import BucketSampler
from fastNLP import DataSetIter
from fastNLP.models import CNNText
from fastNLP import Tester
import torch
import time

# Embedding dimension passed to the model below.
embed_dim = 100
# Build the CNNText classifier: the first argument is the (vocabulary size, embed_dim) tuple,
# with 2 output classes and a dropout of 0.1.
model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1)

def train(epoch, data, devdata, batch_size=32, lr=0.001):
    """Train the module-level ``model`` with a hand-written loop over DataSetIter batches.

    After every epoch the model is evaluated on ``devdata`` with a Tester, and one
    summary line is printed: average training loss, the formatted evaluation result,
    and the milliseconds elapsed since training started (cumulative, evaluation included).

    :param epoch: number of passes over ``data``.
    :param data: training DataSet; batches are read as batch_x['words'] and batch_y['target'].
    :param devdata: DataSet evaluated after each epoch.
    :param batch_size: batch size for both the sampler and the iterator (default 32).
    :param lr: learning rate of the Adam optimizer (default 0.001).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    lossfunc = torch.nn.CrossEntropyLoss()

    # Build a batch iterator: pass in the DataSet, the batch_size, and the rule (sampler) used to draw batches.
    # Sampler choices: in order (Sequential), shuffled (Random), or similar-length samples grouped into a batch (Bucket).
    train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
    train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)

    start_time = time.time()
    print("-"*5+"start training"+"-"*5)
    for i in range(epoch):
        # Put the model in training mode explicitly so dropout is active no matter
        # which mode the model was left in before this epoch.
        model.train()
        loss_list = []
        for batch_x, batch_y in train_batch:
            optimizer.zero_grad()
            output = model(batch_x['words'])
            loss = lossfunc(output['pred'], batch_y['target'])
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())

        # With verbose=0, Tester.test() prints nothing and only returns the evaluation results;
        # with verbose=1 it also prints them.
        # After test(), _format_eval_results(res) turns the results into a printable string.
        tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)
        res = tester_tmp.test()

        # Guard against an iterator that produced no batches (would divide by zero).
        avg_loss = sum(loss_list) / len(loss_list) if loss_list else float('nan')
        print('Epoch {:d} Avg Loss: {:.2f}'.format(i, avg_loss),end=" ")
        print(tester_tmp._format_eval_results(res),end=" ")
        print('{:d}ms'.format(round((time.time()-start_time)*1000)))

# Train for 10 epochs on train_data, evaluating on dev_data after each epoch.
train(10, train_data, dev_data)
# Use Tester for a quick evaluation on the held-out test_data.
tester = Tester(test_data, model, metrics=AccuracyMetric())
tester.test()

-----start training-----


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 2.68 seconds!
Epoch 0 Avg Loss: 0.66 AccuracyMetric: acc=0.708716 29307ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.38 seconds!
Epoch 1 Avg Loss: 0.41 AccuracyMetric: acc=0.770642 52200ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.51 seconds!
Epoch 2 Avg Loss: 0.16 AccuracyMetric: acc=0.747706 70268ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.96 seconds!
Epoch 3 Avg Loss: 0.06 AccuracyMetric: acc=0.741972 90349ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 1.04 seconds!
Epoch 4 Avg Loss: 0.03 AccuracyMetric: acc=0.740826 114250ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.8 seconds!
Epoch 5 Avg Loss: 0.02 AccuracyMetric: acc=0.738532 134742ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.65 seconds!
Epoch 6 Avg Loss: 0.01 AccuracyMetric: acc=0.731651 154503ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.8 seconds!
Epoch 7 Avg Loss: 0.01 AccuracyMetric: acc=0.738532 175397ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.36 seconds!
Epoch 8 Avg Loss: 0.01 AccuracyMetric: acc=0.733945 192384ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.84 seconds!
Epoch 9 Avg Loss: 0.01 AccuracyMetric: acc=0.744266 214417ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.04 seconds!
[tester] 
AccuracyMetric: acc=0.786667


{'AccuracyMetric': {'acc': 0.786667}}