You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ContinuityValidation.py 7.0 kB

first commit Former-commit-id: 08bc23ba02cffbce3cf63962390a65459a132e48 [formerly 0795edd4834b9b7dc66db8d10d4cbaf42bbf82cb] [formerly b5010b42541add7e2ea2578bf2da537efc457757 [formerly a7ca09c2c34c4fc8b3d8e01fcfa08eeeb2cae99d]] [formerly 615058473a2177ca5b89e9edbb797f4c2a59c7e5 [formerly 743d8dfc6843c4c205051a8ab309fbb2116c895e] [formerly bb0ea98b1e14154ef464e2f7a16738705894e54b [formerly 960a69da74b81ef8093820e003f2d6c59a34974c]]] [formerly 2fa3be52c1b44665bc81a7cc7d4cea4bbf0d91d5 [formerly 2054589f0898627e0a17132fd9d4cc78efc91867] [formerly 3b53730e8a895e803dfdd6ca72bc05e17a4164c1 [formerly 8a2fa8ab7baf6686d21af1f322df46fd58c60e69]] [formerly 87d1e3a07a19d03c7d7c94d93ab4fa9f58dada7c [formerly f331916385a5afac1234854ee8d7f160f34b668f] [formerly 69fb3c78a483343f5071da4f7e2891b83a49dd18 [formerly 386086f05aa9487f65bce2ee54438acbdce57650]]]] Former-commit-id: a00aed8c934a6460c4d9ac902b9a74a3d6864697 [formerly 26fdeca29c2f07916d837883983ca2982056c78e] [formerly 0e3170d41a2f99ecf5c918183d361d4399d793bf [formerly 3c12ad4c88ac5192e0f5606ac0d88dd5bf8602dc]] [formerly d5894f84f2fd2e77a6913efdc5ae388cf1be0495 [formerly ad3e7bc670ff92c992730d29c9d3aa1598d844e8] [formerly 69fb3c78a483343f5071da4f7e2891b83a49dd18]] Former-commit-id: 3c19c9fae64f6106415fbc948a4dc613b9ee12f8 [formerly 467ddc0549c74bb007e8f01773bb6dc9103b417d] [formerly 5fa518345d958e2760e443b366883295de6d991c [formerly 3530e130b9fdb7280f638dbc2e785d2165ba82aa]] Former-commit-id: 9f5d473d42a435ec0d60149939d09be1acc25d92 [formerly be0b25c4ec2cde052a041baf0e11f774a158105d] Former-commit-id: 9eca71cb73ba9edccd70ac06a3b636b8d4093b04
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. from d3m import container, exceptions
  2. from d3m.primitive_interfaces import base, transformer
  3. from d3m.metadata import base as metadata_base, hyperparams
  4. import os.path
  5. from d3m import utils
  6. import time
  7. __all__ = ('ContinuityValidation',)
  8. Inputs = container.DataFrame
  9. Outputs = container.DataFrame
  10. class Hyperparams(hyperparams.Hyperparams):
  11. continuity_option = hyperparams.Enumeration(
  12. values=['ablation', 'imputation'],
  13. default='imputation',
  14. semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
  15. description="Choose ablation or imputation the original data",
  16. )
  17. interval = hyperparams.Uniform(
  18. default = 1,
  19. lower = 0.000000001,
  20. upper = 10000000000,
  21. description='Only used in imputation, give the timestamp interval.',
  22. semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
  23. )
  24. class ContinuityValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
  25. """
  26. Check whether the seires data is consitent in time interval and provide processing if not consistent.
  27. Parameters
  28. ----------
  29. continuity_option: enumeration
  30. Choose ablation or imputation.
  31. ablation: delete some rows and increase timestamp interval to keep the timestamp consistent
  32. imputation: linearly imputate the absent timestamps to keep the timestamp consistent
  33. interval: float
  34. Only used in imputation, give the timestamp interval. ‘interval’ should be an integral multiple of 'timestamp' or 'timestamp' should be an integral multiple of ‘interval’
  35. """
  36. __author__: "DATA Lab at Texas A&M University"
  37. metadata = metadata_base.PrimitiveMetadata({
  38. "name": "continuity validation primitive",
  39. "python_path": "d3m.primitives.tods.data_processing.continuity_validation",
  40. "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu',
  41. 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/ContinuityValidation.py']},
  42. "algorithm_types": [metadata_base.PrimitiveAlgorithmType.CONTINUITY_VALIDATION, ],
  43. "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
  44. "id": "ef8fb025-d157-476c-8e2e-f8fe56162195",
  45. "hyperparams_to_tune": ['continuity_option', 'interval'],
  46. "version": "0.0.1",
  47. })
  48. def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
  49. """
  50. Args:
  51. inputs: Container DataFrame
  52. timeout: Default
  53. iterations: Default
  54. Returns:
  55. Container DataFrame with consistent timestamp
  56. """
  57. # self.logger.warning('Hi, ContinuityValidation.produce was called!')
  58. if self.hyperparams['continuity_option'] == 'ablation':
  59. outputs = self._continuity_ablation(inputs)
  60. if self.hyperparams['continuity_option'] == 'imputation':
  61. outputs = self._continuity_imputation(inputs)
  62. outputs.reset_index(drop=True, inplace=True)
  63. self._update_metadata(outputs)
  64. # self._write(outputs)
  65. return base.CallResult(outputs)
  66. def _update_metadata(self, outputs):
  67. outputs.metadata = outputs.metadata.generate(outputs)
  68. def _continuity_ablation(self, inputs: Inputs):
  69. ablation_set = self._find_ablation_set(inputs)
  70. inputs = inputs.loc[inputs['timestamp'].isin(ablation_set)].copy()
  71. inputs.sort_values("timestamp",inplace=True)
  72. inputs['d3mIndex'] = list(range(inputs.shape[0]))
  73. return inputs
  74. def _find_ablation_set(self, inputs):
  75. """
  76. Find the longest series with minimum timestamp interval of inputs
  77. """
  78. # find the min inteval and max interval
  79. min_interval = inputs.iloc[1]['timestamp'] - inputs.iloc[0]['timestamp']
  80. for i in range(2, inputs.shape[0]):
  81. curr_interval = inputs.iloc[i]['timestamp'] - inputs.iloc[i - 1]['timestamp']
  82. if min_interval > curr_interval:
  83. min_interval = curr_interval
  84. max_interval = ((inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']) + min_interval * (2 - inputs.shape[0]))
  85. print((inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']), inputs.shape[0])
  86. interval = min_interval
  87. ablation_set = list()
  88. origin_set = set(inputs['timestamp'])
  89. print(min_interval, max_interval)
  90. while interval <= max_interval:
  91. start = 0
  92. while (inputs.iloc[start]['timestamp'] <= inputs.iloc[0]['timestamp'] + max_interval) and (inputs.iloc[start]['timestamp'] <= inputs.iloc[-1]['timestamp']):
  93. tmp_list = list()
  94. tmp = utils.numpy.arange(start=inputs.iloc[start]['timestamp'], step=interval,stop=inputs.iloc[-1]['timestamp'])
  95. for i in tmp:
  96. if i in origin_set:
  97. tmp_list.append(i)
  98. else: break
  99. ablation_set.append(tmp_list)
  100. start += 1
  101. interval += min_interval
  102. max_size_index = 0
  103. for i in range(1, len(ablation_set)):
  104. if len(ablation_set[i]) > len(ablation_set[max_size_index]):
  105. max_size_index = i
  106. return ablation_set[max_size_index]
  107. def _continuity_imputation(self, inputs: Inputs):
  108. """
  109. Linearly imputate the missing timestmap and value of inputs
  110. """
  111. interval = self.hyperparams['interval']
  112. time1 = inputs.iloc[0]['timestamp']
  113. for i in range(1, inputs.shape[0]):
  114. time2 = inputs.iloc[i]['timestamp']
  115. if time2 - time1 != interval:
  116. blank_number = int((time2 - time1) / interval) # how many imputation should there be between two timestamps in original data
  117. for j in range(1, blank_number):
  118. dict = {'timestamp':[time1 + interval * j], 'ground_truth':[int(inputs.iloc[i]['ground_truth'])]}
  119. for col in list(inputs.columns.values):
  120. if not col in ['d3mIndex', 'timestamp', 'ground_truth']:
  121. dict[col] = [inputs.iloc[i-1][col] + (inputs.iloc[i][col] - inputs.iloc[i-1][col]) / blank_number * j]
  122. inputs = inputs.append(utils.pandas.DataFrame(dict), ignore_index=True, sort=False)
  123. time1 = time2
  124. inputs.sort_values("timestamp",inplace=True)
  125. inputs['d3mIndex'] = list(range(inputs.shape[0]))
  126. return inputs
  127. def _write(self, inputs:Inputs):
  128. """
  129. write inputs to current directory, only for test
  130. """
  131. inputs.to_csv(str(time.time())+'.csv')

全栈的自动化机器学习系统,主要针对多变量时间序列数据的异常检测。TODS提供了详尽的用于构建基于机器学习的异常检测系统的模块,它们包括:数据处理(data processing),时间序列处理( time series processing),特征分析(feature analysis),检测算法(detection algorithms),和强化模块( reinforcement module)。这些模块所提供的功能包括常见的数据预处理、时间序列数据的平滑或变换,从时域或频域中抽取特征、多种多样的检测算