|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- import os
- import typing
-
- from d3m import container, utils as d3m_utils
- from d3m.base import utils as base_utils
- from d3m.metadata import base as metadata_base, hyperparams
- from d3m.primitive_interfaces import base, transformer
-
- import common_primitives
-
# Only the primitive class is exported from this module.
__all__ = ('DatasetToDataFramePrimitive',)

# Container types the primitive consumes and produces; they are used below as
# the type parameters of the primitive base class and in the `produce` signature.
Inputs = container.Dataset
Outputs = container.DataFrame
-
-
class Hyperparams(hyperparams.Hyperparams):
    # Optional resource ID (a string, or None by default) that `produce` hands
    # to the tabular-resource lookup to choose which resource to extract.
    dataframe_resource = hyperparams.Hyperparameter[typing.Optional[str]](
        description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.",
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        default=None,
    )
-
-
class DatasetToDataFramePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    A primitive which extracts a DataFrame out of a Dataset.
    """

    # Static description of the primitive: identity, provenance, installation
    # instructions and classification.
    metadata = metadata_base.PrimitiveMetadata(
        {
            'id': '4b42ce1e-9b98-4a25-b68e-fad13311eb65',
            'version': '0.3.0',
            'name': "Extract a DataFrame from a Dataset",
            'python_path': 'd3m.primitives.tods.data_processing.dataset_to_dataframe',
            'source': {
                'name': common_primitives.__author__,
                'contact': 'mailto:mitar.commonprimitives@tnode.com',
                'uris': [
                    'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataset_to_dataframe.py',
                    'https://gitlab.com/datadrivendiscovery/common-primitives.git',
                ],
            },
            'installation': [
                {
                    'type': metadata_base.PrimitiveInstallationType.PIP,
                    # The commit placeholder is filled with whatever
                    # `current_git_commit` reports for this file's directory.
                    'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format(
                        git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
                    ),
                },
            ],
            'algorithm_types': [metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION],
            'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
        },
    )

    # Extracts one tabular resource from `inputs` and returns it, wrapped in a
    # CallResult, with metadata rebuilt for a standalone DataFrame.
    # `timeout` and `iterations` are accepted but not used here.
    # NOTE(review): documented with comments rather than a docstring on the
    # assumption that the framework may derive descriptions from docstrings - confirm.
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        requested_resource = self.hyperparams['dataframe_resource']

        # The helper returns both the ID of the resource it picked and the
        # resource itself; presumably it falls back to an entry point or the
        # only tabular resource when no ID is requested - verify against d3m.
        picked_id, extracted = base_utils.get_tabular_resource(inputs, requested_resource)

        # Replace the resource's metadata with metadata derived from the
        # Dataset-level metadata for that resource.
        extracted.metadata = self._update_metadata(inputs.metadata, picked_id)

        # Sanity check on the helper's result; the offending type is the assertion message.
        assert isinstance(extracted, container.DataFrame), type(extracted)

        return base.CallResult(extracted)

    def _update_metadata(self, metadata: metadata_base.DataMetadata, resource_id: metadata_base.SelectorSegment) -> metadata_base.DataMetadata:
        """
        Build DataFrame-level metadata from the Dataset metadata of one resource.

        The metadata queried at ``(resource_id,)`` becomes the top-level metadata
        of a new ``DataMetadata`` object (with the container schema version set),
        ``copy_to`` is then applied from the Dataset metadata for the same selector,
        and finally the dataset entry point semantic type is removed from the top level.

        Raises ``TypeError`` if the resource metadata has no ``structural_type``
        or that type is not a subclass of ``container.DataFrame``.
        """

        selector = (resource_id,)
        top_level = dict(metadata.query(selector))

        is_dataframe = 'structural_type' in top_level and issubclass(top_level['structural_type'], container.DataFrame)
        if not is_dataframe:
            raise TypeError("The Dataset resource is not a DataFrame, but \"{type}\".".format(
                type=top_level.get('structural_type', None),
            ))

        top_level['schema'] = metadata_base.CONTAINER_SCHEMA_VERSION

        # Presumably carries over everything nested under the resource selector
        # (e.g. per-column metadata) - confirm against d3m `copy_to` semantics.
        extracted_metadata = metadata.copy_to(metadata_base.DataMetadata(top_level), selector)

        # Resource is not anymore an entry point.
        return extracted_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')
|