hummingbird
/
fastNLP

import unittest

import numpy as np
import torch

from fastNLP import Batch
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import SequentialSampler


def generate_fake_dataset(num_samples=1000):
    """
    产生的DataSet包含以下的field {'1':[], '2':[], '3': [], '4':[]}
    :param num_samples: sample的数量
    :return:
    """
    
    max_len = 50
    min_len = 10
    num_features = 4
    
    data_dict = {}
    for i in range(num_features):
        data = []
        lengths = np.random.randint(min_len, max_len, size=(num_samples))
        for length in lengths:
            data.append(np.random.randint(100, size=length))
        data_dict[str(i)] = data
    
    dataset = DataSet(data_dict)
    
    for i in range(num_features):
        if np.random.randint(2) == 0:
            dataset.set_input(str(i))
        else:
            dataset.set_target(str(i))
    return dataset


def construct_dataset(sentences):
    """Construct a data set from a list of sentences.

    :param sentences: list of list of str
    :return dataset: a DataSet object
    """
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset


class TestCase1(unittest.TestCase):
    def test_simple(self):
        dataset = construct_dataset(
            [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)])
        dataset.set_target()
        batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        
        cnt = 0
        for _, _ in batch:
            cnt += 1
        self.assertEqual(cnt, 10)
    
    def test_dataset_batching(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
            self.assertEqual(len(x["x"]), 4)
            self.assertEqual(len(y["y"]), 4)
            self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
            self.assertListEqual(list(y["y"][-1]), [5, 6])
    
    def test_list_padding(self):
        ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
                      "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        for x, y in iter:
            self.assertEqual(x["x"].shape, (4, 4))
            self.assertEqual(y["y"].shape, (4, 4))
    
    def test_numpy_padding(self):
        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        for x, y in iter:
            self.assertEqual(x["x"].shape, (4, 4))
            self.assertEqual(y["y"].shape, (4, 4))
    
    def test_list_to_tensor(self):
        ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
                      "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
    
    def test_numpy_to_tensor(self):
        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
    
    def test_list_of_list_to_tensor(self):
        ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                     [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
    
    def test_list_of_numpy_to_tensor(self):
        ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                     [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in iter:
            print(x, y)
    
    def test_sequential_batch(self):
        batch_size = 32
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)
        
        batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
        for batch_x, batch_y in batch:
            pass
    
    """
    def test_multi_workers_batch(self):
        batch_size = 32
        pause_seconds = 0.01
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)

        num_workers = 1
        batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers)
        for batch_x, batch_y in batch:
            time.sleep(pause_seconds)

        num_workers = 2
        batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers)
        end1 = time.time()
        for batch_x, batch_y in batch:
            time.sleep(pause_seconds)
    """
    """
    def test_pin_memory(self):
        batch_size = 32
        pause_seconds = 0.01
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)

        batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), pin_memory=True)
        # 这里发生OOM
        # for batch_x, batch_y in batch:
        #     time.sleep(pause_seconds)

        num_workers = 2
        batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers,
                      pin_memory=True)
        # 这里发生OOM
        # for batch_x, batch_y in batch:
        #     time.sleep(pause_seconds)
    """