You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

TimeIntervalTransform.py 7.4 kB

first commit Former-commit-id: 08bc23ba02cffbce3cf63962390a65459a132e48 [formerly 0795edd4834b9b7dc66db8d10d4cbaf42bbf82cb] [formerly b5010b42541add7e2ea2578bf2da537efc457757 [formerly a7ca09c2c34c4fc8b3d8e01fcfa08eeeb2cae99d]] [formerly 615058473a2177ca5b89e9edbb797f4c2a59c7e5 [formerly 743d8dfc6843c4c205051a8ab309fbb2116c895e] [formerly bb0ea98b1e14154ef464e2f7a16738705894e54b [formerly 960a69da74b81ef8093820e003f2d6c59a34974c]]] [formerly 2fa3be52c1b44665bc81a7cc7d4cea4bbf0d91d5 [formerly 2054589f0898627e0a17132fd9d4cc78efc91867] [formerly 3b53730e8a895e803dfdd6ca72bc05e17a4164c1 [formerly 8a2fa8ab7baf6686d21af1f322df46fd58c60e69]] [formerly 87d1e3a07a19d03c7d7c94d93ab4fa9f58dada7c [formerly f331916385a5afac1234854ee8d7f160f34b668f] [formerly 69fb3c78a483343f5071da4f7e2891b83a49dd18 [formerly 386086f05aa9487f65bce2ee54438acbdce57650]]]] Former-commit-id: a00aed8c934a6460c4d9ac902b9a74a3d6864697 [formerly 26fdeca29c2f07916d837883983ca2982056c78e] [formerly 0e3170d41a2f99ecf5c918183d361d4399d793bf [formerly 3c12ad4c88ac5192e0f5606ac0d88dd5bf8602dc]] [formerly d5894f84f2fd2e77a6913efdc5ae388cf1be0495 [formerly ad3e7bc670ff92c992730d29c9d3aa1598d844e8] [formerly 69fb3c78a483343f5071da4f7e2891b83a49dd18]] Former-commit-id: 3c19c9fae64f6106415fbc948a4dc613b9ee12f8 [formerly 467ddc0549c74bb007e8f01773bb6dc9103b417d] [formerly 5fa518345d958e2760e443b366883295de6d991c [formerly 3530e130b9fdb7280f638dbc2e785d2165ba82aa]] Former-commit-id: 9f5d473d42a435ec0d60149939d09be1acc25d92 [formerly be0b25c4ec2cde052a041baf0e11f774a158105d] Former-commit-id: 9eca71cb73ba9edccd70ac06a3b636b8d4093b04
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. import os
  2. import uuid
  3. import typing
  4. import collections
  5. import numpy as np
  6. import pandas as pd
  7. import common_primitives
  8. from common_primitives import dataframe_utils, utils
  9. from datetime import datetime, timezone
  10. from d3m.primitive_interfaces import base, transformer
  11. from d3m import container, exceptions, utils as d3m_utils
  12. from d3m.metadata import base as metadata_base, hyperparams
  13. __all__ = ('TimeIntervalTransform',)
  14. Inputs = container.DataFrame
  15. Outputs = container.DataFrame
  16. """
  17. TODO: Implementation for up-sampling the data (when time_interval is less than current time series interval)
  18. """
  19. class Hyperparams(hyperparams.Hyperparams):
  20. time_interval = hyperparams.Hyperparameter[typing.Union[str, None]](
  21. default='5T',
  22. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  23. description='timestamp to transform.'
  24. )
  25. # Keep previous
  26. dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]](
  27. default=None,
  28. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  29. description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.",
  30. )
  31. use_columns = hyperparams.Set(
  32. elements=hyperparams.Hyperparameter[int](-1),
  33. default=(2,),
  34. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  35. description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
  36. )
  37. exclude_columns = hyperparams.Set(
  38. elements=hyperparams.Hyperparameter[int](-1),
  39. default=(0,1,3,),
  40. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  41. description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
  42. )
  43. return_result = hyperparams.Enumeration(
  44. values=['append', 'replace', 'new'],
  45. default='new',
  46. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  47. description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
  48. )
  49. use_semantic_types = hyperparams.UniformBool(
  50. default=False,
  51. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  52. description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
  53. )
  54. add_index_columns = hyperparams.UniformBool(
  55. default=False,
  56. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  57. description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
  58. )
  59. error_on_no_input = hyperparams.UniformBool(
  60. default=True,
  61. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  62. description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
  63. )
  64. return_semantic_type = hyperparams.Enumeration[str](
  65. values=['https://metadata.datadrivendiscovery.org/types/Attribute',
  66. 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
  67. default='https://metadata.datadrivendiscovery.org/types/Attribute',
  68. description='Decides what semantic type to attach to generated attributes',
  69. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
  70. )
  71. class TimeIntervalTransform(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
  72. """
  73. A primitive which configures the time interval of the dataframe.
  74. Resample the timestamps based on the time_interval passed as hyperparameter
  75. """
  76. metadata = metadata_base.PrimitiveMetadata({
  77. '__author__': "DATA Lab @Texas A&M University",
  78. 'name': "Time Interval Transform",
  79. 'python_path': 'd3m.primitives.tods.data_processing.time_interval_transform',
  80. 'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu',
  81. 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/TimeIntervalTransform.py']},
  82. 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.TIME_INTERVAL_TRANSFORM,],
  83. 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
  84. 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'TimeIntervalTransformPrimitive')),
  85. 'hyperparams_to_tune': ['time_interval'],
  86. 'version': '0.0.2'
  87. })
  88. def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
  89. resource = inputs.reset_index(drop=True)
  90. """
  91. Args:
  92. inputs: Container DataFrame
  93. Returns:
  94. Container DataFrame with resampled time intervals
  95. """
  96. if self.hyperparams['time_interval'] is None:
  97. time_interval = '5T'
  98. else:
  99. time_interval = self.hyperparams['time_interval']
  100. try:
  101. outputs = self._time_interval_transform(inputs, hyperparams)
  102. #print(outputs)
  103. except Exception as e:
  104. self.logger.error("Error in Performing Time Interval Transform",e)
  105. self._update_metadata(outputs)
  106. return base.CallResult(outputs)
  107. def _time_interval_transform(self, inputs: Inputs, hyperparams: Hyperparams):
  108. """
  109. Args:
  110. inputs: Container DataFrame
  111. Returns:
  112. Container DataFrame with resampled time intervals
  113. """
  114. #configure dataframe for resampling
  115. inputs['timestamp'] = pd.to_datetime(inputs['timestamp'], unit='s')
  116. inputs['timestamp'] = inputs['timestamp'].dt.tz_localize('US/Pacific')
  117. inputs = inputs.set_index('timestamp')
  118. #resample dataframe
  119. inputs = inputs.resample(self.hyperparams['time_interval']).mean()
  120. #configure dataframe to original format
  121. inputs = inputs.reset_index()
  122. value_columns = list(set(inputs.columns) - set(['d3mIndex', 'timestamp', 'ground_truth']))
  123. inputs = inputs.reindex(columns=['d3mIndex','timestamp'] + value_columns + ['ground_truth'])
  124. inputs['timestamp'] = inputs['timestamp'].astype(np.int64) // 10 ** 9
  125. inputs['d3mIndex'] = range(0, len(inputs))
  126. """
  127. Since the mean of the ground_truth was taken for a set interval,
  128. we should set those values that are greater than 0 to 1 so they are consistent with original data
  129. """
  130. for i in range(len(inputs['ground_truth'])):
  131. if(inputs['ground_truth'][i] > 0):
  132. inputs.loc[i, 'ground_truth'] = 1
  133. inputs = container.DataFrame(inputs) #convert pandas DataFrame back to d3m comtainer(Important)
  134. return inputs
  135. def _update_metadata(self, outputs):
  136. outputs.metadata = outputs.metadata.generate(outputs)

全栈的自动化机器学习系统,主要针对多变量时间序列数据的异常检测。TODS提供了详尽的用于构建基于机器学习的异常检测系统的模块,它们包括:数据处理(data processing),时间序列处理( time series processing),特征分析(feature analysis),检测算法(detection algorithms),和强化模块( reinforcement module)。这些模块所提供的功能包括常见的数据预处理、时间序列数据的平滑或变换,从时域或频域中抽取特征、多种多样的检测算