|
- from d3m import container, exceptions
- from d3m.primitive_interfaces import base, transformer
- from d3m.metadata import base as metadata_base, hyperparams
-
- import os.path
- from d3m import utils
-
- import time
-
# Names exported by `from <module> import *`: only the primitive class.
__all__ = ('ContinuityValidation',)

# The primitive consumes and produces d3m container DataFrames; these aliases
# parameterize the TransformerPrimitiveBase generic used by the class below.
Inputs = container.DataFrame
Outputs = container.DataFrame
-
-
class Hyperparams(hyperparams.Hyperparams):
    """Hyper-parameters of the continuity validation primitive."""

    # Which repair strategy to apply when timestamps are not evenly spaced.
    continuity_option = hyperparams.Enumeration(
        default='imputation',
        values=['ablation', 'imputation'],
        description="Choose ablation or imputation the original data",
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
    )

    # Target spacing between consecutive timestamps (imputation only).
    interval = hyperparams.Uniform(
        lower=1e-9,
        upper=10_000_000_000,
        default=1,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
        description='Only used in imputation, give the timestamp interval.',
    )
-
-
class ContinuityValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Check whether the series data is consistent in time interval and provide processing if not consistent.

    Parameters
    ----------
    continuity_option: enumeration
        Choose ablation or imputation.
            ablation: delete some rows and increase timestamp interval to keep the timestamp consistent
            imputation: linearly impute the absent timestamps to keep the timestamp consistent
    interval: float
        Only used in imputation, give the timestamp interval. 'interval' should be an integral multiple of 'timestamp' or 'timestamp' should be an integral multiple of 'interval'
    """

    # Plain assignment: the previous `__author__: "..."` form was only an
    # annotation and never actually set the attribute.
    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name": "continuity validation primitive",
        "python_path": "d3m.primitives.tods.data_processing.continuity_validation",
        "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu',
                   'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/ContinuityValidation.py']},
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.CONTINUITY_VALIDATION, ],
        "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "id": "ef8fb025-d157-476c-8e2e-f8fe56162195",
        "hyperparams_to_tune": ['continuity_option', 'interval'],
        "version": "0.0.1",
    })

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Make the 'timestamp' column of ``inputs`` evenly spaced.

        Args:
            inputs: Container DataFrame with a numeric 'timestamp' column
            timeout: Default (unused)
            iterations: Default (unused)

        Returns:
            Container DataFrame with consistent timestamp, a fresh 0..n-1
            row index and regenerated metadata

        Raises:
            exceptions.InvalidArgumentValueError: if 'continuity_option' is
                neither 'ablation' nor 'imputation'
        """
        option = self.hyperparams['continuity_option']
        if option == 'ablation':
            outputs = self._continuity_ablation(inputs)
        elif option == 'imputation':
            outputs = self._continuity_imputation(inputs)
        else:
            # Fail loudly instead of hitting an unbound `outputs` below.
            raise exceptions.InvalidArgumentValueError(
                "Unknown continuity_option: {!r}".format(option)
            )

        outputs.reset_index(drop=True, inplace=True)
        self._update_metadata(outputs)

        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        """Regenerate the metadata of ``outputs`` in place from its current content."""
        outputs.metadata = outputs.metadata.generate(outputs)

    def _continuity_ablation(self, inputs: Inputs):
        """
        Keep only the rows whose timestamp belongs to the evenly spaced run
        chosen by ``_find_ablation_set``.

        Returns a new frame sorted by 'timestamp' with 'd3mIndex' rewritten
        to 0..n-1; ``inputs`` itself is not modified.
        """
        ablation_set = self._find_ablation_set(inputs)
        outputs = inputs.loc[inputs['timestamp'].isin(ablation_set)].copy()

        # Rows sharing one timestamp would break the even spacing; keep the
        # first of each (no-op when timestamps are already unique).
        outputs.drop_duplicates(subset='timestamp', keep='first', inplace=True)
        outputs.sort_values("timestamp", inplace=True)
        outputs['d3mIndex'] = list(range(outputs.shape[0]))

        return outputs

    def _find_ablation_set(self, inputs):
        """
        Find the longest evenly spaced run of timestamps present in ``inputs``.

        Candidate spacings are the multiples of the smallest gap between
        neighbouring timestamps, up to the largest gap that could exist.
        For every spacing and every admissible start, the run is extended
        while each next timestamp exists in the data. The first longest run
        found wins (smaller spacings and earlier starts are tried first).

        Returns:
            list of timestamps forming the chosen run (empty for empty input)
        """
        # Sorted unique values guarantee a strictly positive minimum gap, so
        # the search below always terminates even for unsorted/duplicated input.
        timestamps = sorted(set(inputs['timestamp'].tolist()))
        count = len(timestamps)
        if count < 2:
            return timestamps

        first, last = timestamps[0], timestamps[-1]
        min_interval = min(
            later - earlier for earlier, later in zip(timestamps, timestamps[1:])
        )
        # Largest single gap possible when the other (count - 2) gaps are minimal.
        max_interval = (last - first) + min_interval * (2 - count)
        # Mathematically max_interval >= min_interval; clamp to absorb float
        # round-off so at least one spacing is always examined.
        max_interval = max(max_interval, min_interval)

        self.logger.debug(
            "continuity ablation: span=%s rows=%s min_interval=%s max_interval=%s",
            last - first, count, min_interval, max_interval,
        )

        origin_set = set(timestamps)
        best_run = []
        multiple = 1
        interval = min_interval

        while interval <= max_interval:
            for start_ts in timestamps:
                # NOTE(review): starts later than first + max_interval are never
                # tried (kept from the original algorithm); this can skip a
                # longer run located later in the series - confirm intent.
                if start_ts > first + max_interval:
                    break

                run = []
                step = 0
                current = start_ts
                # Extend while the next expected timestamp exists. Unlike a
                # half-open arange(stop=last), this can include the last one.
                while current in origin_set:
                    run.append(current)
                    step += 1
                    current = start_ts + step * interval

                if len(run) > len(best_run):
                    best_run = run

            multiple += 1
            # Multiply rather than accumulate to avoid float drift.
            interval = min_interval * multiple

        return best_run

    def _continuity_imputation(self, inputs: Inputs):
        """
        Linearly impute the missing timestamps and values of ``inputs``.

        For every pair of neighbouring rows whose timestamps are more than
        one 'interval' apart, rows are inserted at each missing multiple of
        'interval'. Columns other than 'd3mIndex', 'timestamp' and
        'ground_truth' are linearly interpolated between the two rows;
        'ground_truth' (when present) is copied from the later row as int.

        Returns a new frame sorted by 'timestamp' with 'd3mIndex' rewritten
        to 0..n-1; ``inputs`` itself is not modified.
        """
        interval = self.hyperparams['interval']
        row_count = inputs.shape[0]

        reserved = ('d3mIndex', 'timestamp', 'ground_truth')
        value_columns = [col for col in inputs.columns.values if col not in reserved]
        has_ground_truth = 'ground_truth' in inputs.columns

        new_rows = []
        for i in range(1, row_count):
            prev_row = inputs.iloc[i - 1]
            curr_row = inputs.iloc[i]
            time1 = prev_row['timestamp']
            time2 = curr_row['timestamp']

            # Number of 'interval' steps between the two rows. Snap to the
            # nearest integer when within round-off (0.3 / 0.1 -> 3, not 2),
            # otherwise truncate as before.
            ratio = (time2 - time1) / interval
            nearest = round(ratio)
            if abs(ratio - nearest) <= 1e-9 * max(1.0, abs(ratio)):
                blank_number = int(nearest)
            else:
                blank_number = int(ratio)

            for j in range(1, blank_number):
                row = {'timestamp': time1 + interval * j}
                if has_ground_truth:
                    row['ground_truth'] = int(curr_row['ground_truth'])
                for col in value_columns:
                    row[col] = prev_row[col] + (curr_row[col] - prev_row[col]) / blank_number * j
                new_rows.append(row)

        if new_rows:
            # One concat instead of a DataFrame.append per inserted row.
            outputs = utils.pandas.concat(
                [inputs, utils.pandas.DataFrame(new_rows)],
                ignore_index=True,
                sort=False,
            )
        else:
            # Copy so the in-place operations below never touch the caller's frame.
            outputs = inputs.copy()

        outputs.sort_values("timestamp", inplace=True)
        outputs['d3mIndex'] = list(range(outputs.shape[0]))
        return outputs

    def _write(self, inputs: Inputs):
        """
        Write inputs to '<unix time>.csv' in the current directory, only for test.
        """
        inputs.to_csv(str(time.time()) + '.csv')
|