diff --git a/axolotl/axolotl/backend/simple.py b/axolotl/axolotl/backend/simple.py
index 2d6b9ad..9e058c5 100644
--- a/axolotl/axolotl/backend/simple.py
+++ b/axolotl/axolotl/backend/simple.py
@@ -172,6 +172,7 @@ class SimpleRunner(RunnerBase):
         else:
             pipeline_result.status = "COMPLETED"
             pipeline_result.scores = runtime_module.combine_folds(scores)
+            pipeline_result.outputs = [result.values for result in results]
 
         self.request_results[request_id] = pipeline_result
         return request_id
diff --git a/examples/run_pipeline.py b/examples/run_pipeline.py
index ae9d8ec..ea36682 100644
--- a/examples/run_pipeline.py
+++ b/examples/run_pipeline.py
@@ -1,11 +1,9 @@
-import pandas as pd
 import sys
 import argparse
-
-from searcher import schemas as schemas_utils
-from searcher.utils import generate_dataset_problem, evaluate_pipeline
-from axolotl.utils import pipeline as pipeline_utils
 import os
+import pandas as pd
+
+from tods import generate_dataset, load_pipeline, evaluate_pipeline
 
 this_path = os.path.dirname(os.path.abspath(__file__))
 #table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
@@ -17,7 +15,7 @@
 parser.add_argument('--target_index', type=int, default=6,
                     help='Index of the ground truth (for evaluation)')
 parser.add_argument('--metric',type=str, default='F1_MACRO',
                     help='Evaluation Metric (F1, F1_MACRO)')
-parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/searcher/resources/default_pipeline.json'),
+parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/tods/resources/default_pipeline.json'),
                     help='Input the path of the pre-built pipeline description')
 args = parser.parse_args()
@@ -27,16 +25,14 @@
 target_index = args.target_index # what column is the target
 pipeline_path = args.pipeline_path
 metric = args.metric # F1 on both label 0 and 1
 
-time_limit = 30 # How many seconds you wanna search
-
-# Read data and generate dataset and problem
+# Read data and generate dataset
 df = pd.read_csv(table_path)
-dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+dataset = generate_dataset(df, target_index)
 
 # Load the default pipeline
-pipeline = pipeline_utils.load_pipeline(pipeline_path)
+pipeline = load_pipeline(pipeline_path)
 
 # Run the pipeline
-pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
+pipeline_result = evaluate_pipeline(dataset, pipeline, metric)
 print(pipeline_result)
diff --git a/examples/run_automl.py b/examples/run_search.py
similarity index 83%
rename from examples/run_automl.py
rename to examples/run_search.py
index dd54b7c..2bc0c01 100644
--- a/examples/run_automl.py
+++ b/examples/run_search.py
@@ -2,8 +2,8 @@
 import pandas as pd
 
 from axolotl.backend.simple import SimpleRunner
-from searcher.utils import generate_dataset_problem
-from searcher.search import BruteForceSearch
+from tods import generate_dataset, generate_problem
+from tods.searcher import BruteForceSearch
 
 # Some information
 #table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset
@@ -19,13 +19,15 @@
 metric = 'F1_MACRO' # F1 on both label 0 and 1
 # Read data and generate dataset and problem
 df = pd.read_csv(table_path)
-dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+dataset = generate_dataset(df, target_index=target_index)
+problem_description = generate_problem(dataset, metric)
 
 # Start backend
-backend = SimpleRunner(random_seed=42)
+backend = SimpleRunner(random_seed=0)
 
 # Start search algorithm
-search = BruteForceSearch(problem_description=problem_description, backend=backend)
+search = BruteForceSearch(problem_description=problem_description,
+                          backend=backend)
 
 # Find the best pipeline
 best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
diff --git a/tods/searcher/__init__.py b/tods/searcher/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tods/searcher/utils.py b/tods/searcher/utils.py
deleted file mode 100644
index e375354..0000000
--- a/tods/searcher/utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-
-def generate_dataset_problem(df, target_index, metric):
-    """
-    A wrapper for generating dataset and problem
-
-    Args:
-        df (pandas.DataFrame): dataset
-        target_index (int): The column index of the target
-        metric (str): `F1` for computing F1 on label 1, 'F1_MACRO` for
-            macro-F1 on both 0 and 1
-
-    returns:
-        dataset, problem
-    """
-    from axolotl.utils import data_problem
-    from d3m.metadata.problem import TaskKeyword, PerformanceMetric
-
-    if metric == 'F1':
-        performance_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
-    elif metric == 'F1_MACRO':
-        performance_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
-    else:
-        raise ValueError('The metric {} not supported.'.format(metric))
-
-
-    dataset, problem_description = data_problem.generate_dataset_problem(df,
-                                                                         target_index=target_index,
-                                                                         task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
-                                                                         performance_metrics=performance_metrics)
-
-    return dataset, problem_description
-
-def evaluate_pipeline(problem_description, dataset, pipeline):
-    from axolotl.utils import schemas as schemas_utils
-    from axolotl.backend.simple import SimpleRunner
-    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
-    scoring_pipeline = schemas_utils.get_scoring_pipeline()
-    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
-    metrics = problem_description['problem']['performance_metrics']
-
-    backend = SimpleRunner(random_seed=0)
-    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
-                                                pipeline=pipeline,
-                                                input_data=[dataset],
-                                                metrics=metrics,
-                                                data_preparation_pipeline=data_preparation_pipeline,
-                                                scoring_pipeline=scoring_pipeline,
-                                                data_preparation_params=data_preparation_params)
-    return pipeline_result
-
-
diff --git a/tods/tods/__init__.py b/tods/tods/__init__.py
new file mode 100644
index 0000000..16281fe
--- /dev/null
+++ b/tods/tods/__init__.py
@@ -0,0 +1 @@
+from .utils import *
diff --git a/tods/searcher/resources/default_pipeline.json b/tods/tods/resources/default_pipeline.json
similarity index 100%
rename from tods/searcher/resources/default_pipeline.json
rename to tods/tods/resources/default_pipeline.json
diff --git a/tods/searcher/schemas.py b/tods/tods/schemas.py
similarity index 100%
rename from tods/searcher/schemas.py
rename to tods/tods/schemas.py
diff --git a/tods/searcher/search/__init__.py b/tods/tods/searcher/__init__.py
similarity index 100%
rename from tods/searcher/search/__init__.py
rename to tods/tods/searcher/__init__.py
diff --git a/tods/searcher/search/brute_force_search.py b/tods/tods/searcher/brute_force_search.py
similarity index 100%
rename from tods/searcher/search/brute_force_search.py
rename to tods/tods/searcher/brute_force_search.py
diff --git a/tods/searcher/tods/utils.py b/tods/tods/utils.py
similarity index 57%
rename from tods/searcher/tods/utils.py
rename to tods/tods/utils.py
index f41bb03..14e013a 100644
--- a/tods/searcher/tods/utils.py
+++ b/tods/tods/utils.py
@@ -1,44 +1,82 @@
-def generate_dataset_problem(df, target_index, metric):
+def load_pipeline(pipeline_path):
+    """Load a pipeline given a path
+
+    Args:
+        pipeline_path (str): The path to a pipeline file
+
+    Returns:
+        pipeline
     """
-    A wrapper for generating dataset and problem
+    from axolotl.utils import pipeline as pipeline_utils
+    pipeline = pipeline_utils.load_pipeline(pipeline_path)
+
+    return pipeline
+
+def generate_dataset(df, target_index):
+    """Generate dataset
 
     Args:
         df (pandas.DataFrame): dataset
         target_index (int): The column index of the target
+
+    Returns:
+        dataset
+    """
+    from axolotl.utils import data_problem
+    dataset = data_problem.import_input_data(df, target_index=target_index)
+
+    return dataset
+
+def generate_problem(dataset, metric):
+    """Generate a problem description
+
+    Args:
+        dataset: dataset
         metric (str): `F1` for computing F1 on label 1, 'F1_MACRO` for
             macro-F1 on both 0 and 1
 
     returns:
-        dataset, problem
+        problem_description
     """
     from axolotl.utils import data_problem
     from d3m.metadata.problem import TaskKeyword, PerformanceMetric
-
     if metric == 'F1':
         performance_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
     elif metric == 'F1_MACRO':
         performance_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
     else:
         raise ValueError('The metric {} not supported.'.format(metric))
-
-
-    dataset, problem_description = data_problem.generate_dataset_problem(df,
-                                                                         target_index=target_index,
-                                                                         task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
-                                                                         performance_metrics=performance_metrics)
-    return dataset, problem_description
+    problem_description = data_problem.generate_problem_description(dataset=dataset,
+                                                                    task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
+                                                                    performance_metrics=performance_metrics)
+
+    return problem_description
+
+def evaluate_pipeline(dataset, pipeline, metric='F1', seed=0):
+    """Evaluate a Pipeline
+
+    Args:
+        dataset: A dataset
+        pipeline: A pipeline
+        metric (str): `F1` for computing F1 on label 1, `F1_MACRO` for
+            macro-F1 on both 0 and 1
+        seed (int): A random seed
-def evaluate_pipeline(problem_description, dataset, pipeline):
+
+    Returns:
+        pipeline_result
+    """
     from axolotl.utils import schemas as schemas_utils
     from axolotl.backend.simple import SimpleRunner
+
+    problem_description = generate_problem(dataset, metric)
     data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
     scoring_pipeline = schemas_utils.get_scoring_pipeline()
     data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
     metrics = problem_description['problem']['performance_metrics']
 
-    backend = SimpleRunner(random_seed=0)
+    backend = SimpleRunner(random_seed=seed)
     pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                 pipeline=pipeline,
                                                 input_data=[dataset],
@@ -46,14 +84,6 @@ def evaluate_pipeline(problem_description, dataset, pipeline):
                                                 data_preparation_pipeline=data_preparation_pipeline,
                                                 scoring_pipeline=scoring_pipeline,
                                                 data_preparation_params=data_preparation_params)
-    try:
-        for error in pipeline_result.error:
-            if error is not None:
-                raise error
-    except:
-        import traceback
-        traceback.print_exc()
-
     return pipeline_result
 
 
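
A minimal usage sketch of the refactored API introduced by this patch. The input CSV name below is hypothetical; the target column index and the resource path are taken from the examples and renames above, and the calls follow the new tods/tods/utils.py and the SimpleRunner change in the first hunk.

import pandas as pd
from tods import generate_dataset, load_pipeline, evaluate_pipeline

# Build a dataset directly from a DataFrame; column 6 holds the ground truth,
# matching the --target_index default in examples/run_pipeline.py.
df = pd.read_csv('timeseries.csv')  # hypothetical input CSV
dataset = generate_dataset(df, target_index=6)

# evaluate_pipeline() now derives the problem description internally from the
# dataset and metric, so callers no longer pass a problem_description.
pipeline = load_pipeline('tods/tods/resources/default_pipeline.json')  # path as renamed in this patch
pipeline_result = evaluate_pipeline(dataset, pipeline, metric='F1_MACRO', seed=0)

print(pipeline_result.scores)   # fold scores combined by the backend
print(pipeline_result.outputs)  # per-fold outputs, newly exposed by SimpleRunner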