You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

yelpLoader.py 2.6 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. import ast
  2. from fastNLP import DataSet, Instance, Vocabulary
  3. from fastNLP.core.vocabulary import VocabularyOption
  4. from fastNLP.io import JsonLoader
  5. from fastNLP.io.base_loader import DataInfo
  6. from fastNLP.io.embed_loader import EmbeddingOption
  7. from fastNLP.io.file_reader import _read_json
  8. from typing import Union, Dict
  9. from reproduction.Star_transformer.datasets import EmbedLoader
  10. from reproduction.utils import check_dataloader_paths
  11. class yelpLoader(JsonLoader):
  12. """
  13. 读取Yelp数据集, DataSet包含fields:
  14. review_id: str, 22 character unique review id
  15. user_id: str, 22 character unique user id
  16. business_id: str, 22 character business id
  17. useful: int, number of useful votes received
  18. funny: int, number of funny votes received
  19. cool: int, number of cool votes received
  20. date: str, date formatted YYYY-MM-DD
  21. words: list(str), 需要分类的文本
  22. target: str, 文本的标签
  23. 数据来源: https://www.yelp.com/dataset/download
  24. :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
  25. """
  26. def __init__(self, fine_grained=False):
  27. super(yelpLoader, self).__init__()
  28. tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
  29. '4.0': 'positive', '5.0': 'very positive'}
  30. if not fine_grained:
  31. tag_v['1.0'] = tag_v['2.0']
  32. tag_v['5.0'] = tag_v['4.0']
  33. self.fine_grained = fine_grained
  34. self.tag_v = tag_v
  35. def _load(self, path):
  36. ds = DataSet()
  37. for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
  38. d = ast.literal_eval(d)
  39. d["words"] = d.pop("text").split()
  40. d["target"] = self.tag_v[str(d.pop("stars"))]
  41. ds.append(Instance(**d))
  42. return ds
  43. def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None,
  44. embed_opt: EmbeddingOption = None):
  45. paths = check_dataloader_paths(paths)
  46. datasets = {}
  47. info = DataInfo()
  48. vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
  49. for name, path in paths.items():
  50. dataset = self.load(path)
  51. datasets[name] = dataset
  52. vocab.from_dataset(dataset, field_name="words")
  53. info.vocabs = vocab
  54. info.datasets = datasets
  55. if embed_opt is not None:
  56. embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
  57. info.embeddings['words'] = embed
  58. return info