import os import uuid import typing import collections import numpy as np import pandas as pd import common_primitives from common_primitives import dataframe_utils, utils from datetime import datetime, timezone from d3m.primitive_interfaces import base, transformer from d3m import container, exceptions, utils as d3m_utils from d3m.metadata import base as metadata_base, hyperparams __all__ = ('TimeIntervalTransform',) Inputs = container.DataFrame Outputs = container.DataFrame """ TODO: Implementation for up-sampling the data (when time_interval is less than current time series interval) """ class Hyperparams(hyperparams.Hyperparams): time_interval = hyperparams.Hyperparameter[typing.Union[str, None]]( default='5T', semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description='timestamp to transform.' ) # Keep previous dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( default=None, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", ) use_columns = hyperparams.Set( elements=hyperparams.Hyperparameter[int](-1), default=(2,), semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", ) exclude_columns = hyperparams.Set( elements=hyperparams.Hyperparameter[int](-1), default=(0,1,3,), semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", ) return_result = hyperparams.Enumeration( values=['append', 'replace', 'new'], default='new', semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", ) use_semantic_types = hyperparams.UniformBool( default=False, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" ) add_index_columns = hyperparams.UniformBool( default=False, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", ) error_on_no_input = hyperparams.UniformBool( default=True, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", ) return_semantic_type = hyperparams.Enumeration[str]( values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], default='https://metadata.datadrivendiscovery.org/types/Attribute', description='Decides what semantic type to attach to generated attributes', semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] ) class TimeIntervalTransform(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): """ A primitive which configures the time interval of the dataframe. Resample the timestamps based on the time_interval passed as hyperparameter """ metadata = metadata_base.PrimitiveMetadata({ '__author__': "DATA Lab @Texas A&M University", 'name': "Time Interval Transform", 'python_path': 'd3m.primitives.tods.data_processing.time_interval_transform', 'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/TimeIntervalTransform.py']}, 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.TIME_INTERVAL_TRANSFORM,], 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'TimeIntervalTransformPrimitive')), 'hyperparams_to_tune': ['time_interval'], 'version': '0.0.2' }) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: resource = inputs.reset_index(drop=True) """ Args: inputs: Container DataFrame Returns: Container DataFrame with resampled time intervals """ if self.hyperparams['time_interval'] is None: time_interval = '5T' else: time_interval = self.hyperparams['time_interval'] try: outputs = self._time_interval_transform(inputs, hyperparams) #print(outputs) except Exception as e: self.logger.error("Error in Performing Time Interval Transform",e) self._update_metadata(outputs) return base.CallResult(outputs) def _time_interval_transform(self, inputs: Inputs, hyperparams: Hyperparams): """ Args: inputs: Container DataFrame Returns: Container DataFrame with resampled time intervals """ #configure dataframe for resampling inputs['timestamp'] = pd.to_datetime(inputs['timestamp'], unit='s') inputs['timestamp'] = inputs['timestamp'].dt.tz_localize('US/Pacific') inputs = inputs.set_index('timestamp') #resample dataframe inputs = inputs.resample(self.hyperparams['time_interval']).mean() #configure dataframe to original format inputs = inputs.reset_index() value_columns = list(set(inputs.columns) - set(['d3mIndex', 'timestamp', 'ground_truth'])) inputs = inputs.reindex(columns=['d3mIndex','timestamp'] + value_columns + ['ground_truth']) inputs['timestamp'] = inputs['timestamp'].astype(np.int64) // 10 ** 9 inputs['d3mIndex'] = range(0, len(inputs)) """ Since the mean of the ground_truth was taken for a set interval, we should set those values that are greater than 0 to 1 so they are consistent with original data """ for i in range(len(inputs['ground_truth'])): if(inputs['ground_truth'][i] > 0): inputs.loc[i, 'ground_truth'] = 1 inputs = container.DataFrame(inputs) #convert pandas DataFrame back to d3m comtainer(Important) return inputs def _update_metadata(self, outputs): outputs.metadata = outputs.metadata.generate(outputs)