From 56fb66b20ab65cc1c5accbf870b538757b6f973f Mon Sep 17 00:00:00 2001 From: Daochen Zha Date: Tue, 15 Sep 2020 19:34:16 -0500 Subject: [PATCH] Simplify dataset interface and return results in evaluation Former-commit-id: 7f5f4fdeff2cab352bf48eee1997bb685080e03c [formerly d1fbab262d72d07456a02f59ae16809aa648d99a] [formerly 0daaf199a46360e5afdb429aab9ddd82c5dfa1b0 [formerly ce0b30a55c9a43102636ed8f724e0b0801e6628b]] [formerly 269ea6cef2c74d00c76c6ceed188fe5ec8eceed0 [formerly 4a0c281aaaab1af794bd387ba03552088c3149ac] [formerly 5f0ee1311fd4554dbb33288d99a485b028df867e [formerly 1f19e41f6edf005debe9f4365580118104c04e39]]] [formerly 47b61676cf78d12de768c50f3e9ad467484ae05b [formerly d170f442b4608a113ec99cf92f4ab7d97330bd66] [formerly 2d52007adf85922f71abb10b47a9d052231de0f1 [formerly 4c76595f5d37c2a3e8e76176424b9f2b35486dd6]] [formerly 3a1686558cf4deab2ebac50ebfcf14174e6eddab [formerly 7d8c5f00d00cbef99132ecf22ab72022cb2c8f7c] [formerly 13952f711d7d49dadf9114ca601fd0a2ebad7c8f [formerly 40935732ede426e81f46958be128b5bde0e6f435]]]] [formerly ee839034e075a025e82b3e850bab5464145284a8 [formerly a3eb29062cb7dcc57ab1daa6c0ee1948ba2feabc] [formerly 7a19be0e9b827e1904aee632b5a0c5ec5c17568d [formerly 3ad8bb47d6c065b1f8605f41cea4ecbe7e1f3c6d]] [formerly 4e64c258edaa049a585d583f7a8f204adb56133b [formerly 7cab05b9424dcd2ceecaba1ad0409923a10842ff] [formerly c01c4b5ab8cbb8d783a4346d07854f208ee18ed6 [formerly 8a0ffa31f8455a7c1ddafda850bca7fb14fc30ec]]] [formerly e5c827f3b0011ef848a3d309b4be6b23b525deb6 [formerly 66f6a5e88290cc10cf92a8ddd2f32ffec1879a19] [formerly 917b218a7269f60d6b5fbcdf2ca995f07a54ef83 [formerly 259a497af0e15e16cff56095f776cd9d18e90c84]] [formerly baf9af505d146726d76a0f63eef72ffb4690eb7f [formerly 0c3a6024c3f55c4e14b0ee989d162d96d1ea3654] [formerly 8b317a7990e3a4e12fd29bb650630f2a8b96e828 [formerly 92f001166b8af08b4b67c1cffa3cb899b70937bd]]]]] [formerly 81cd5c004191c5a354920a2ed9927a5ea0e0f4f4 [formerly d1803c1a9299a97f848b8d74579d6135f372f8d1] [formerly 
770cfb90a2a503156e102eb3d97e0468f45a66d0 [formerly 4ecb45934545e5fad67f0ae2ca3ad2b2967c9561]] [formerly 11a8644180e0d713075c40961a18a86ebd25bb19 [formerly 60943bca182779f3f692745b120a88e4a942ca1d] [formerly fea29803bf15a9022fa90f064edbb9261a251263 [formerly 42056da4f8fd94ed48b023e353fb624a7cecd20b]]] [formerly cd5cde3f126376c5207831de03cb2ffb30d99c4a [formerly 7e14151b0bd4a2e1b726ce973640eafbab2c763e] [formerly 9efc26abe9f11a900ba7c87958b816e986649c24 [formerly 583fc8220603eb6b63d9a0e44266c0f5af5093ff]] [formerly 311b7db466c6edef50aff9c4576c2628bee62000 [formerly b73ec9c9d683cc153fd66ccdae080355baa95583] [formerly 73fcc752f64512c47d958b1c144b8ef40b4ba928 [formerly ea6b562689269975a5a8a21f8202bbc24c8fa382]]]] [formerly 9870a026c797ef89534662a09ae34d699e831ac5 [formerly d9dd3cb7efcd0216704822e01c50376a80c99c9a] [formerly 58769d6ac67385cd6848e68944cd022aefa0d574 [formerly 0c0373be5a5f2eb408d6854fe35f26d95cc06363]] [formerly 3c62af5a8f10e605630b3db1a4741690eac39cec [formerly 5847362015cc8d4f39fec8a8641d0ea8f36d7b40] [formerly e284c3bc1b729fffa102222383676013680a8eb1 [formerly 60b26ababc756aa5582977e952791144cdbfa54a]]] [formerly cd9bbd04faa7700c89913fa3d74d4bfdc7f68c71 [formerly 80477de5e8ee7f23ec18a58c72874747e446fd64] [formerly 2d502d41f9f3f0d55854d631194492f951b6e19a [formerly 054acf2933cd4219aec4885949c65b83f8857306]] [formerly b25127cc27bd445318f1fd5ba4a82916d01ad4e6 [formerly 13fdd7fbe0b0ca5656491baf97e563bd9ebf95c1] [formerly 3e00ce70c291a5e7743f6063739b785247ba1da5 [formerly a8af4837df65a99875d21e7033a259579fe5a825]]]]]] Former-commit-id: b9cc8adf37128b27621b6ff11bef821bdffeb060 [formerly c5309a28eca2fb214e9898bddeffac67d77d1708] [formerly 8c5684d7306e018225920a6f507c0a0a088ee016 [formerly 4c3967fef1824599fd8dcde651674a7177183cde]] [formerly 0297be1ceceb392b9ada2ed66ce97f05284f91cb [formerly c0a2675486cf1a614417c108cb99d9e4a491dc61] [formerly 9be3285b4bdc585cb06ea30fb778a05c963ebc7d [formerly db084eb7330a21514908c1c81c28d9ee4ee9bfa3]]] [formerly 
e2d2b8030d69ff37ef1b798151bb6ffeb50c03d5 [formerly 4700ffd4337d07080b98475a175f5252a582be2a] [formerly 69ff9f8352d8476f0047888764c5967c86a27f54 [formerly 988e4501f339a08de6eb4f05f63831b5968e066c]] [formerly a63177b969dae48372d02fb6f4f804126f5e5711 [formerly 012633f283d4803619a86991a64a7c4991250a58] [formerly 6cfdeec2e4ec1ab9c0d4c4381a08cca1d4c16d2e [formerly fa85a63cb9f15020d8363e8bc7740c2eecc31bd7]]]] [formerly 14218999e9bb7dbcc77d4b4439666e142de22b1f [formerly aebd4fd7b7092787a818e9d8024e8528bd0f5c79] [formerly 7a94bac70d479a5ff8def19f03cda4171f23b407 [formerly 38525ca0f6ff509cd35235e613f256b8adacb01c]] [formerly 9a5f056bb6e121e82bb4628bc9f63b3274125324 [formerly 36e0e7832231ff3b7516ff2541198469360bc4dc] [formerly 956e79a19ec9a262f0ab138d6591dfb4c255adab [formerly 40bb07c6803a69f09bcffccfdecdd16534a3594d]]] [formerly f647735a811bfe883d340f6645723e5105f8b87f [formerly 3a569a5afdf1d53aabc8f53b4173d91b90337947] [formerly 3845b905c8a7ed065b3cccf65165ce46cb4c9294 [formerly 6ac7882a40c097bd126757404f9629dbf9f83d0b]] [formerly f3d0673d6d7afeea30f5635844b77972da5082ed [formerly 1adec1f3c02e1965679846dddae795cb5caa0718] [formerly 3e00ce70c291a5e7743f6063739b785247ba1da5]]]] Former-commit-id: 9f083d9f48ea0166bebb149a7ba620b59ed446c1 [formerly 818a4e042224f2708bdc506bc64c8ec4a69f1e3a] [formerly e15a115ebc4dc5cee248aae5932837d5fde69c6f [formerly 7a5047d2769b7d24a5096d341caf87c0fa3a0a6d]] [formerly 35f5b04bfe1938b00a48f16c86ac5128812f44f3 [formerly 391036b1fb3a8cf6f01321bfe7e668f353bf6ecb] [formerly 2b02d5630e266b4cfa63ed30ba9446f33943e7b7 [formerly c8c40b8492932a0457073d98d5bc3b5c0af8edbe]]] [formerly e59a2371894c561bfe7544547323a430d9b0f154 [formerly 7991a3a56e31aad1ccca4b7adee99929d2876be1] [formerly 4e2fec05244d9a6f0ad09426dc18c616019f7171 [formerly 93c110b8b02a4a8baca1af234bbb322d22888237]] [formerly dc8c08c0194b0252c9a35055496ea1242cae0a78 [formerly e6fa4eaa57b3f8a8516566884bbd7a12519128b5] [formerly 98a89c92863b5fe5e57f6e4d84d5cf51597bf2a4 [formerly 
86ee4f5d85bc7f7da87a888c208071a7df6ef344]]]] Former-commit-id: 73c340fb7225f8cb739a476db34a363248ec552f [formerly 9ea3e024833aa7967e9bc33c30289fce017ceeea] [formerly 80681084f82f467ca13553684894443d7047a877 [formerly 221e36522ff449cb31884e8b3e4d7b02eade75b3]] [formerly 99e0bc9871c94e90d24eb05e4e361aa94fbb1cf0 [formerly b9937a5c3ef9f60ffe55e85b63a175f242c7de6b] [formerly aa68f881053b970b18780c3e056e62357db266f3 [formerly 9575da9fc6845e78fba25088db28f4958440cf4e]]] Former-commit-id: dce6a3eae7b20d70af1c1b39deaeffb493737e17 [formerly 386f12a53c41a73a6bbf8afea03dd05230421561] [formerly 7102bc3d492c0721f337f3c6782f7162c66b40be [formerly e58449ea19cd0ccd8ecdf4011a3fbfbb37d706f1]] Former-commit-id: 873278d7712e30538879f32d5f5ffb8abed7a5f1 [formerly 9291014ea6dd9a8c60d48cb3c7dd90991642c496] Former-commit-id: cd3cf6574039e314f2bdd77184aab303a797d641 --- axolotl/axolotl/backend/simple.py | 1 + examples/run_pipeline.py | 20 +++--- examples/{run_automl.py => run_search.py} | 12 ++-- tods/searcher/__init__.py | 0 tods/searcher/utils.py | 51 ------------- tods/tods/__init__.py | 1 + .../resources/default_pipeline.json | 0 tods/{searcher => tods}/schemas.py | 0 .../search => tods/searcher}/__init__.py | 0 .../searcher}/brute_force_search.py | 0 tods/{searcher => }/tods/utils.py | 72 +++++++++++++------ 11 files changed, 68 insertions(+), 89 deletions(-) rename examples/{run_automl.py => run_search.py} (83%) delete mode 100644 tods/searcher/__init__.py delete mode 100644 tods/searcher/utils.py create mode 100644 tods/tods/__init__.py rename tods/{searcher => tods}/resources/default_pipeline.json (100%) rename tods/{searcher => tods}/schemas.py (100%) rename tods/{searcher/search => tods/searcher}/__init__.py (100%) rename tods/{searcher/search => tods/searcher}/brute_force_search.py (100%) rename tods/{searcher => }/tods/utils.py (57%) diff --git a/axolotl/axolotl/backend/simple.py b/axolotl/axolotl/backend/simple.py index 2d6b9ad..9e058c5 100644 --- 
a/axolotl/axolotl/backend/simple.py +++ b/axolotl/axolotl/backend/simple.py @@ -172,6 +172,7 @@ class SimpleRunner(RunnerBase): else: pipeline_result.status = "COMPLETED" pipeline_result.scores = runtime_module.combine_folds(scores) + pipeline_result.outputs = [result.values for result in results] self.request_results[request_id] = pipeline_result return request_id diff --git a/examples/run_pipeline.py b/examples/run_pipeline.py index ae9d8ec..ea36682 100644 --- a/examples/run_pipeline.py +++ b/examples/run_pipeline.py @@ -1,11 +1,9 @@ -import pandas as pd import sys import argparse - -from searcher import schemas as schemas_utils -from searcher.utils import generate_dataset_problem, evaluate_pipeline -from axolotl.utils import pipeline as pipeline_utils import os +import pandas as pd + +from tods import generate_dataset, load_pipeline, evaluate_pipeline this_path = os.path.dirname(os.path.abspath(__file__)) #table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset @@ -17,7 +15,7 @@ parser.add_argument('--target_index', type=int, default=6, help='Index of the ground truth (for evaluation)') parser.add_argument('--metric',type=str, default='F1_MACRO', help='Evaluation Metric (F1, F1_MACRO)') -parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/searcher/resources/default_pipeline.json'), +parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/tods/resources/default_pipeline.json'), help='Input the path of the pre-built pipeline description') args = parser.parse_args() @@ -27,16 +25,14 @@ target_index = args.target_index # what column is the target pipeline_path = args.pipeline_path metric = args.metric # F1 on both label 0 and 1 -time_limit = 30 # How many seconds you wanna search - -# Read data and generate dataset and problem +# Read data and generate dataset df = pd.read_csv(table_path) -dataset, problem_description = generate_dataset_problem(df, target_index=target_index, 
metric=metric) +dataset = generate_dataset(df, target_index) # Load the default pipeline -pipeline = pipeline_utils.load_pipeline(pipeline_path) +pipeline = load_pipeline(pipeline_path) # Run the pipeline -pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline) +pipeline_result = evaluate_pipeline(dataset, pipeline, metric) print(pipeline_result) diff --git a/examples/run_automl.py b/examples/run_search.py similarity index 83% rename from examples/run_automl.py rename to examples/run_search.py index dd54b7c..2bc0c01 100644 --- a/examples/run_automl.py +++ b/examples/run_search.py @@ -2,8 +2,8 @@ import pandas as pd from axolotl.backend.simple import SimpleRunner -from searcher.utils import generate_dataset_problem -from searcher.search import BruteForceSearch +from tods import generate_dataset, generate_problem +from tods.searcher import BruteForceSearch # Some information #table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset @@ -19,13 +19,15 @@ metric = 'F1_MACRO' # F1 on both label 0 and 1 # Read data and generate dataset and problem df = pd.read_csv(table_path) -dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric) +dataset = generate_dataset(df, target_index=target_index) +problem_description = generate_problem(dataset, metric) # Start backend -backend = SimpleRunner(random_seed=42) +backend = SimpleRunner(random_seed=0) # Start search algorithm -search = BruteForceSearch(problem_description=problem_description, backend=backend) +search = BruteForceSearch(problem_description=problem_description, + backend=backend) # Find the best pipeline best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit) diff --git a/tods/searcher/__init__.py b/tods/searcher/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tods/searcher/utils.py b/tods/searcher/utils.py deleted file mode 100644 index 
e375354..0000000 --- a/tods/searcher/utils.py +++ /dev/null @@ -1,51 +0,0 @@ - -def generate_dataset_problem(df, target_index, metric): - """ - A wrapper for generating dataset and problem - - Args: - df (pandas.DataFrame): dataset - target_index (int): The column index of the target - metric (str): `F1` for computing F1 on label 1, 'F1_MACRO` for - macro-F1 on both 0 and 1 - - returns: - dataset, problem - """ - from axolotl.utils import data_problem - from d3m.metadata.problem import TaskKeyword, PerformanceMetric - - if metric == 'F1': - performance_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}] - elif metric == 'F1_MACRO': - performance_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}] - else: - raise ValueError('The metric {} not supported.'.format(metric)) - - - dataset, problem_description = data_problem.generate_dataset_problem(df, - target_index=target_index, - task_keywords=[TaskKeyword.ANOMALY_DETECTION,], - performance_metrics=performance_metrics) - - return dataset, problem_description - -def evaluate_pipeline(problem_description, dataset, pipeline): - from axolotl.utils import schemas as schemas_utils - from axolotl.backend.simple import SimpleRunner - data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") - scoring_pipeline = schemas_utils.get_scoring_pipeline() - data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] - metrics = problem_description['problem']['performance_metrics'] - - backend = SimpleRunner(random_seed=0) - pipeline_result = backend.evaluate_pipeline(problem_description=problem_description, - pipeline=pipeline, - input_data=[dataset], - metrics=metrics, - data_preparation_pipeline=data_preparation_pipeline, - scoring_pipeline=scoring_pipeline, - data_preparation_params=data_preparation_params) - return pipeline_result - - diff --git a/tods/tods/__init__.py b/tods/tods/__init__.py new file mode 100644 index 0000000..16281fe --- /dev/null +++ 
b/tods/tods/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/tods/searcher/resources/default_pipeline.json b/tods/tods/resources/default_pipeline.json similarity index 100% rename from tods/searcher/resources/default_pipeline.json rename to tods/tods/resources/default_pipeline.json diff --git a/tods/searcher/schemas.py b/tods/tods/schemas.py similarity index 100% rename from tods/searcher/schemas.py rename to tods/tods/schemas.py diff --git a/tods/searcher/search/__init__.py b/tods/tods/searcher/__init__.py similarity index 100% rename from tods/searcher/search/__init__.py rename to tods/tods/searcher/__init__.py diff --git a/tods/searcher/search/brute_force_search.py b/tods/tods/searcher/brute_force_search.py similarity index 100% rename from tods/searcher/search/brute_force_search.py rename to tods/tods/searcher/brute_force_search.py diff --git a/tods/searcher/tods/utils.py b/tods/tods/utils.py similarity index 57% rename from tods/searcher/tods/utils.py rename to tods/tods/utils.py index f41bb03..14e013a 100644 --- a/tods/searcher/tods/utils.py +++ b/tods/tods/utils.py @@ -1,44 +1,82 @@ -def generate_dataset_problem(df, target_index, metric): +def load_pipeline(pipeline_path): + """Load a pipeline given a path + + Args: + pipeline_path (str): The path to a pipeline file + + Returns: + pipeline """ - A wrapper for generating dataset and problem + from axolotl.utils import pipeline as pipeline_utils + pipeline = pipeline_utils.load_pipeline(pipeline_path) + + return pipeline + +def generate_dataset(df, target_index): + """Generate dataset Args: df (pandas.DataFrame): dataset target_index (int): The column index of the target + + returns: + dataset + """ + from axolotl.utils import data_problem + dataset = data_problem.import_input_data(df, target_index=target_index) + + return dataset + +def generate_problem(dataset, metric): + """Generate a problem description + + Args: + dataset: dataset metric (str): `F1` for computing F1 on label 1, 'F1_MACRO` for macro-F1 on 
both 0 and 1 returns: - dataset, problem + problem_description """ from axolotl.utils import data_problem from d3m.metadata.problem import TaskKeyword, PerformanceMetric - if metric == 'F1': performance_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}] elif metric == 'F1_MACRO': performance_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}] else: raise ValueError('The metric {} not supported.'.format(metric)) - - - dataset, problem_description = data_problem.generate_dataset_problem(df, - target_index=target_index, - task_keywords=[TaskKeyword.ANOMALY_DETECTION,], - performance_metrics=performance_metrics) - return dataset, problem_description + problem_description = data_problem.generate_problem_description(dataset=dataset, + task_keywords=[TaskKeyword.ANOMALY_DETECTION,], + performance_metrics=performance_metrics) + + return problem_description + +def evaluate_pipeline(dataset, pipeline, metric='F1', seed=0): + """Evaluate a Pipeline + + Args: + dataset: A dataset + pipeline: A pipeline + metric (str): `F1` for computing F1 on label 1, 'F1_MACRO` for + macro-F1 on both 0 and 1 + seed (int): A random seed -def evaluate_pipeline(problem_description, dataset, pipeline): + Returns: + pipeline_result + """ from axolotl.utils import schemas as schemas_utils from axolotl.backend.simple import SimpleRunner + + problem_description = generate_problem(dataset, metric) data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") scoring_pipeline = schemas_utils.get_scoring_pipeline() data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] metrics = problem_description['problem']['performance_metrics'] - backend = SimpleRunner(random_seed=0) + backend = SimpleRunner(random_seed=seed) pipeline_result = backend.evaluate_pipeline(problem_description=problem_description, pipeline=pipeline, input_data=[dataset], @@ -46,14 +84,6 @@ def evaluate_pipeline(problem_description, dataset, pipeline): 
data_preparation_pipeline=data_preparation_pipeline, scoring_pipeline=scoring_pipeline, data_preparation_params=data_preparation_params) - try: - for error in pipeline_result.error: - if error is not None: - raise error - except: - import traceback - traceback.print_exc() - return pipeline_result