import os
import typing

from d3m import container, utils as d3m_utils
from d3m.base import utils as base_utils
from d3m.metadata import base as metadata_base, hyperparams
from d3m.primitive_interfaces import base, transformer

import common_primitives

__all__ = ('DatasetToDataFramePrimitive',)

Inputs = container.Dataset
Outputs = container.DataFrame


class Hyperparams(hyperparams.Hyperparams):
    # Selects which tabular resource to pull out of the Dataset when several
    # candidates exist and none of them is marked as the dataset entry point.
    dataframe_resource = hyperparams.Hyperparameter[typing.Optional[str]](
        default=None,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.",
    )


class DatasetToDataFramePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    A primitive which extracts a DataFrame out of a Dataset.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            'id': '4b42ce1e-9b98-4a25-b68e-fad13311eb65',
            'version': '0.3.0',
            'name': "Extract a DataFrame from a Dataset",
            'python_path': 'd3m.primitives.tods.data_processing.dataset_to_dataframe',
            'source': {
                'name': common_primitives.__author__,
                'contact': 'mailto:mitar.commonprimitives@tnode.com',
                'uris': [
                    'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataset_to_dataframe.py',
                    'https://gitlab.com/datadrivendiscovery/common-primitives.git',
                ],
            },
            'installation': [{
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format(
                    git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
                ),
            }],
            'algorithm_types': [
                metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION,
            ],
            'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
        },
    )

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Extract the selected tabular resource from ``inputs`` and return it as a
        standalone DataFrame with adjusted metadata.
        """
        # Pick the resource: either the one named by the hyper-parameter or,
        # when it is None, the one the helper resolves (e.g. the entry point).
        resource_id, selected = base_utils.get_tabular_resource(inputs, self.hyperparams['dataframe_resource'])

        selected.metadata = self._update_metadata(inputs.metadata, resource_id)

        assert isinstance(selected, container.DataFrame), type(selected)

        return base.CallResult(selected)

    def _update_metadata(self, metadata: metadata_base.DataMetadata, resource_id: metadata_base.SelectorSegment) -> metadata_base.DataMetadata:
        """
        Build metadata for the extracted DataFrame from the Dataset's metadata.

        Copies everything stored under ``(resource_id,)`` to the top level of a
        fresh ``DataMetadata`` object, stamps it with the container schema, and
        drops the "DatasetEntryPoint" semantic type, since the DataFrame is a
        standalone container now.

        Raises ``TypeError`` if the selected resource is not a DataFrame.
        """
        resource_level = dict(metadata.query((resource_id,)))

        if 'structural_type' not in resource_level or not issubclass(resource_level['structural_type'], container.DataFrame):
            raise TypeError("The Dataset resource is not a DataFrame, but \"{type}\".".format(
                type=resource_level.get('structural_type', None),
            ))

        # The value becomes a top-level container, so it carries the container
        # schema itself.
        resource_level['schema'] = metadata_base.CONTAINER_SCHEMA_VERSION

        fresh_metadata = metadata_base.DataMetadata(resource_level)
        fresh_metadata = metadata.copy_to(fresh_metadata, (resource_id,))

        # Resource is not anymore an entry point.
        return fresh_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')