yingda.chen 3 years ago
parent
commit
b31c86aa0e
3 changed files with 32 additions and 18 deletions
  1. +17
    -12
      modelscope/pydatasets/py_dataset.py
  2. +8
    -1
      modelscope/utils/constant.py
  3. +7
    -5
      tests/pipelines/test_text_classification.py

+ 17
- 12
modelscope/pydatasets/py_dataset.py View File

@@ -1,9 +1,9 @@
import logging
from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
Union)


from datasets import Dataset, load_dataset


from modelscope.utils.constant import Hubs
from modelscope.utils.logger import get_logger


logger = get_logger()
@@ -41,17 +41,17 @@ class PyDataset:
return dataset


@staticmethod
def load(
path: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None
) -> 'PyDataset':
def load(path: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str,
Union[str,
Sequence[str]]]]] = None,
hub: Optional[Hubs] = None) -> 'PyDataset':
"""Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
Args: Args:


@@ -62,10 +62,15 @@ class PyDataset:
data_dir (str, optional): Defining the data_dir of the dataset configuration. I
data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
split (str, optional): Which split of the data to load.
hub (Hubs, optional): When loading from a remote hub, where it is from


Returns:
PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
"""
if Hubs.modelscope == hub:
# TODO: parse data meta information from modelscope hub
# and possibly download data files to local (and update path)
print('getting data from modelscope hub')
if isinstance(path, str):
dataset = load_dataset(
path,


+ 8
- 1
modelscope/utils/constant.py View File

@@ -57,13 +57,20 @@ class Tasks(object):




class InputFields(object):
""" Names for input data fileds in the input data for pipelines
""" Names for input data fields in the input data for pipelines
""" """
img = 'img' img = 'img'
text = 'text' text = 'text'
audio = 'audio' audio = 'audio'




class Hubs(object):
""" Source from which an entity (such as a Dataset or Model) is stored
"""
modelscope = 'modelscope'
huggingface = 'huggingface'


# configuration filename
# in order to avoid conflict with huggingface
# config file we use maas_config instead


+ 7
- 5
tests/pipelines/test_text_classification.py View File

@@ -10,7 +10,7 @@ from modelscope.models.nlp import BertForSequenceClassification
from modelscope.pipelines import SequenceClassificationPipeline, pipeline
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Tasks
from modelscope.utils.constant import Hubs, Tasks
from modelscope.utils.hub import get_model_cache_dir




@@ -81,13 +81,15 @@ class SequenceClassificationTest(unittest.TestCase):
text_classification = pipeline(
task=Tasks.text_classification, model=self.model_id)
result = text_classification(
PyDataset.load('glue', name='sst2', target='sentence'))
PyDataset.load(
'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
self.printDataset(result)


def test_run_with_default_model(self):
text_classification = pipeline(task=Tasks.text_classification)
result = text_classification(
PyDataset.load('glue', name='sst2', target='sentence'))
PyDataset.load(
'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
self.printDataset(result)


def test_run_with_dataset(self):
@@ -97,9 +99,9 @@ class SequenceClassificationTest(unittest.TestCase):
text_classification = pipeline(
Tasks.text_classification, model=model, preprocessor=preprocessor)
# loaded from huggingface dataset
# TODO: add load_from parameter (an enum) LOAD_FROM.hugging_face
# TODO: rename parameter as dataset_name and subset_name
dataset = PyDataset.load('glue', name='sst2', target='sentence')
dataset = PyDataset.load(
'glue', name='sst2', target='sentence', hub=Hubs.huggingface)
result = text_classification(dataset)
self.printDataset(result)




Loading…
Cancel
Save