|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- import os
- import uuid
- import typing
- import collections
-
- import numpy as np
- import pandas as pd
-
- import common_primitives
- from common_primitives import dataframe_utils, utils
-
- from datetime import datetime, timezone
- from d3m.primitive_interfaces import base, transformer
- from d3m import container, exceptions, utils as d3m_utils
- from d3m.metadata import base as metadata_base, hyperparams
-
-
- __all__ = ('TimeIntervalTransform',)
-
- Inputs = container.DataFrame
- Outputs = container.DataFrame
-
-
- """
- TODO: Implementation for up-sampling the data (when time_interval is less than current time series interval)
- """
-
class Hyperparams(hyperparams.Hyperparams):
    """Hyperparameters for TimeIntervalTransform.

    Controls the resampling interval plus the standard column-selection
    knobs shared by TODS data-processing primitives.
    """

    # Pandas offset-alias string passed to DataFrame.resample (e.g. '5T'
    # is 5 minutes). A value of None falls back to '5T' in produce().
    time_interval = hyperparams.Hyperparameter[typing.Union[str, None]](
        default='5T',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description='timestamp to transform.'
    )

    # Keep previous
    dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]](
        default=None,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.",
    )
    # Column indices to operate on; takes precedence over exclude_columns.
    use_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(2,),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(0,1,3,),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
    )
    # Only consulted when use_semantic_types is True.
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute',
                'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
        default='https://metadata.datadrivendiscovery.org/types/Attribute',
        description='Decides what semantic type to attach to generated attributes',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )
-
-
-
class TimeIntervalTransform(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    A primitive which configures the time interval of the dataframe.
    Resample the timestamps based on the time_interval passed as hyperparameter.
    """

    metadata = metadata_base.PrimitiveMetadata({
        '__author__': "DATA Lab @Texas A&M University",
        'name': "Time Interval Transform",
        'python_path': 'd3m.primitives.tods.data_processing.time_interval_transform',
        'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu',
                   'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/TimeIntervalTransform.py']},
        'algorithm_types': [metadata_base.PrimitiveAlgorithmType.TIME_INTERVAL_TRANSFORM,],
        'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'TimeIntervalTransformPrimitive')),
        'hyperparams_to_tune': ['time_interval'],
        'version': '0.0.2'
    })

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Resample the input dataframe to the configured time interval.

        Args:
            inputs: Container DataFrame expected to hold 'd3mIndex',
                'timestamp' (unix seconds) and 'ground_truth' columns.

        Returns:
            CallResult wrapping a Container DataFrame with resampled
            time intervals.
        """
        try:
            # BUG FIX: the original passed a bare (undefined) `hyperparams`
            # name here, raising NameError on every call.
            outputs = self._time_interval_transform(inputs, self.hyperparams)
        except Exception as e:
            # Log with lazy %-formatting and re-raise: swallowing the error
            # would leave `outputs` undefined and mask the real failure.
            self.logger.error("Error in Performing Time Interval Transform: %s", e)
            raise

        self._update_metadata(outputs)

        return base.CallResult(outputs)

    def _time_interval_transform(self, inputs: Inputs, hyperparams: Hyperparams):
        """
        Resample the dataframe on its 'timestamp' column.

        Args:
            inputs: Container DataFrame to resample (not mutated).
            hyperparams: Hyperparams providing 'time_interval'.

        Returns:
            Container DataFrame with resampled time intervals.
        """
        # Fall back to a 5-minute interval when the hyperparameter is unset;
        # the original computed this fallback in produce() but never used it.
        time_interval = hyperparams['time_interval']
        if time_interval is None:
            time_interval = '5T'

        # Work on a copy so the caller's dataframe is not mutated in place.
        inputs = inputs.copy()

        # Record the value columns in their original order (deterministic,
        # unlike the original list(set(...)) which shuffled column order).
        value_columns = [c for c in inputs.columns
                         if c not in ('d3mIndex', 'timestamp', 'ground_truth')]

        # Configure dataframe for resampling: timestamps arrive as unix
        # seconds; tz choice presumably matches the upstream data — TODO confirm.
        inputs['timestamp'] = pd.to_datetime(inputs['timestamp'], unit='s')
        inputs['timestamp'] = inputs['timestamp'].dt.tz_localize('US/Pacific')
        inputs = inputs.set_index('timestamp')

        # Resample dataframe, aggregating each bucket by its mean.
        inputs = inputs.resample(time_interval).mean()

        # Configure dataframe back to the original layout and unix seconds.
        inputs = inputs.reset_index()
        inputs = inputs.reindex(columns=['d3mIndex', 'timestamp'] + value_columns + ['ground_truth'])
        inputs['timestamp'] = inputs['timestamp'].astype(np.int64) // 10 ** 9
        inputs['d3mIndex'] = range(len(inputs))

        # Since the mean of the ground_truth was taken over each interval,
        # values can become fractional; clip every positive value back to 1
        # so labels stay consistent with the original {0, 1} encoding.
        # (Vectorized replacement for the original per-row Python loop.)
        inputs.loc[inputs['ground_truth'] > 0, 'ground_truth'] = 1

        inputs = container.DataFrame(inputs)  # convert pandas DataFrame back to d3m container (important)

        return inputs

    def _update_metadata(self, outputs):
        # Regenerate the d3m metadata so it reflects the resampled frame.
        outputs.metadata = outputs.metadata.generate(outputs)
-
|