|
- from d3m import container
- from d3m.primitive_interfaces import base, transformer
- from d3m.metadata import base as metadata_base, hyperparams
-
- import os.path
- from d3m import utils
-
- import time
-
# Only the primitive class is part of this module's public API.
__all__ = ('DuplicationValidation',)

# Container types the primitive consumes and produces.
Inputs = container.DataFrame
Outputs = container.DataFrame
-
-
class Hyperparams(hyperparams.Hyperparams):
    """
    Hyperparameters of the duplication validation primitive.

    keep_option selects how rows that share a timestamp are collapsed:
    'first' keeps the first of the duplicated rows, 'average' replaces
    them with their average.
    """

    keep_option = hyperparams.Enumeration(
        default='first',
        values=['first', 'average'],
        description="When dropping rows, choose to keep the first one of duplicated data or calculate their average",
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
    )
-
-
class DuplicationValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Check whether the series data contains duplicated rows for one timestamp,
    and collapse them into a single row if the duplication exists.

    Parameters
    ----------
    keep_option: enumeration
        When dropping rows, choose to keep the first one or calculate the average
    """

    # Plain assignment: a bare annotation (`__author__: "..."`) would not
    # create the attribute at all.
    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name": "duplication validation primitive",
        "python_path": "d3m.primitives.tods.data_processing.duplication_validation",
        "source": {
            'name': 'DATA Lab at Texas A&M University',
            'contact': 'mailto:khlai037@tamu.edu',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py',
            ],
        },
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DUPLICATION_VALIDATION,],
        "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "id": "cf6d8137-73d8-496e-a2e3-49f941ee716d",
        "hyperparams_to_tune": ['keep_option'],
        "version": "0.0.1",
    })

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Collapse rows that share the same value in the 'timestamp' column.

        Args:
            inputs: Container DataFrame with a 'timestamp' column
            timeout: Default (not used by this method)
            iterations: Default (not used by this method)

        Returns:
            Container DataFrame after dropping the duplication

        Raises:
            ValueError: if the 'keep_option' hyperparameter is neither
                'first' nor 'average'.
        """
        keep_option = self.hyperparams['keep_option']

        if keep_option == 'first':
            outputs = self._timestamp_keep_first(inputs)
        elif keep_option == 'average':
            outputs = self._timestamp_keep_average(inputs)
        else:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError("Unsupported keep_option: {!r}".format(keep_option))

        self._update_metadata(outputs)

        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        """Regenerate the metadata of `outputs` in place from its current content."""
        outputs.metadata = outputs.metadata.generate(outputs)

    def _timestamp_keep_first(self, inputs: Inputs):
        """Return `inputs` keeping only the first row of every distinct timestamp."""
        return inputs.drop_duplicates(subset=['timestamp'], keep='first')

    def _timestamp_keep_average(self, inputs: Inputs):
        """
        Return one row per distinct timestamp with value columns averaged.

        The first row of every timestamp is kept; each numeric column other
        than 'd3mIndex', 'timestamp' and 'ground_truth' is then overwritten
        with the mean of that column over all rows sharing the timestamp.
        Columns without a numeric mean keep the value of the first row.
        """
        untouched_columns = ('d3mIndex', 'timestamp', 'ground_truth')

        deduplicated = inputs.drop_duplicates(subset=['timestamp'], keep='first')

        # sort=False keeps the groups in order of first appearance, which is
        # the row order drop_duplicates(keep='first') produces. With the
        # default sort=True the positional assignment below would pair
        # averages with the wrong rows whenever timestamps are unsorted.
        averaged = inputs.groupby('timestamp', sort=False).mean(numeric_only=True)

        for col in list(deduplicated.columns.values):
            if col in untouched_columns or col not in averaged.columns:
                continue
            deduplicated[col] = averaged[col].values

        return deduplicated

    def _write(self, inputs: Inputs):
        """
        Write inputs to a CSV file in the current directory, only for test.

        The file is named after the current Unix time; produce() does not
        call this helper.
        """
        inputs.to_csv(str(time.time()) + '.csv')
|