Browse Source

Simplify dataset interface and return results in evaluation

Former-commit-id: 7f5f4fdeff [formerly d1fbab262d] [formerly 0daaf199a4 [formerly ce0b30a55c]] [formerly 269ea6cef2 [formerly 4a0c281aaa] [formerly 5f0ee1311f [formerly 1f19e41f6e]]] [formerly 47b61676cf [formerly d170f442b4] [formerly 2d52007adf [formerly 4c76595f5d]] [formerly 3a1686558c [formerly 7d8c5f00d0] [formerly 13952f711d [formerly 40935732ed]]]] [formerly ee839034e0 [formerly a3eb29062c] [formerly 7a19be0e9b [formerly 3ad8bb47d6]] [formerly 4e64c258ed [formerly 7cab05b942] [formerly c01c4b5ab8 [formerly 8a0ffa31f8]]] [formerly e5c827f3b0 [formerly 66f6a5e882] [formerly 917b218a72 [formerly 259a497af0]] [formerly baf9af505d [formerly 0c3a6024c3] [formerly 8b317a7990 [formerly 92f001166b]]]]] [formerly 81cd5c0041 [formerly d1803c1a92] [formerly 770cfb90a2 [formerly 4ecb459345]] [formerly 11a8644180 [formerly 60943bca18] [formerly fea29803bf [formerly 42056da4f8]]] [formerly cd5cde3f12 [formerly 7e14151b0b] [formerly 9efc26abe9 [formerly 583fc82206]] [formerly 311b7db466 [formerly b73ec9c9d6] [formerly 73fcc752f6 [formerly ea6b562689]]]] [formerly 9870a026c7 [formerly d9dd3cb7ef] [formerly 58769d6ac6 [formerly 0c0373be5a]] [formerly 3c62af5a8f [formerly 5847362015] [formerly e284c3bc1b [formerly 60b26ababc]]] [formerly cd9bbd04fa [formerly 80477de5e8] [formerly 2d502d41f9 [formerly 054acf2933]] [formerly b25127cc27 [formerly 13fdd7fbe0] [formerly 3e00ce70c2 [formerly a8af4837df]]]]]]
Former-commit-id: b9cc8adf37 [formerly c5309a28ec] [formerly 8c5684d730 [formerly 4c3967fef1]] [formerly 0297be1cec [formerly c0a2675486] [formerly 9be3285b4b [formerly db084eb733]]] [formerly e2d2b8030d [formerly 4700ffd433] [formerly 69ff9f8352 [formerly 988e4501f3]] [formerly a63177b969 [formerly 012633f283] [formerly 6cfdeec2e4 [formerly fa85a63cb9]]]] [formerly 14218999e9 [formerly aebd4fd7b7] [formerly 7a94bac70d [formerly 38525ca0f6]] [formerly 9a5f056bb6 [formerly 36e0e78322] [formerly 956e79a19e [formerly 40bb07c680]]] [formerly f647735a81 [formerly 3a569a5afd] [formerly 3845b905c8 [formerly 6ac7882a40]] [formerly f3d0673d6d [formerly 1adec1f3c0] [formerly 3e00ce70c2]]]]
Former-commit-id: 9f083d9f48 [formerly 818a4e0422] [formerly e15a115ebc [formerly 7a5047d276]] [formerly 35f5b04bfe [formerly 391036b1fb] [formerly 2b02d5630e [formerly c8c40b8492]]] [formerly e59a237189 [formerly 7991a3a56e] [formerly 4e2fec0524 [formerly 93c110b8b0]] [formerly dc8c08c019 [formerly e6fa4eaa57] [formerly 98a89c9286 [formerly 86ee4f5d85]]]]
Former-commit-id: 73c340fb72 [formerly 9ea3e02483] [formerly 80681084f8 [formerly 221e36522f]] [formerly 99e0bc9871 [formerly b9937a5c3e] [formerly aa68f88105 [formerly 9575da9fc6]]]
Former-commit-id: dce6a3eae7 [formerly 386f12a53c] [formerly 7102bc3d49 [formerly e58449ea19]]
Former-commit-id: 873278d771 [formerly 9291014ea6]
Former-commit-id: cd3cf65740
master
Daochen Zha 5 years ago
parent
commit
56fb66b20a
11 changed files with 68 additions and 89 deletions
  1. +1
    -0
      axolotl/axolotl/backend/simple.py
  2. +8
    -12
      examples/run_pipeline.py
  3. +7
    -5
      examples/run_search.py
  4. +0
    -0
      tods/searcher/__init__.py
  5. +0
    -51
      tods/searcher/utils.py
  6. +1
    -0
      tods/tods/__init__.py
  7. +0
    -0
      tods/tods/resources/default_pipeline.json
  8. +0
    -0
      tods/tods/schemas.py
  9. +0
    -0
      tods/tods/searcher/__init__.py
  10. +0
    -0
      tods/tods/searcher/brute_force_search.py
  11. +51
    -21
      tods/tods/utils.py

+ 1
- 0
axolotl/axolotl/backend/simple.py View File

@@ -172,6 +172,7 @@ class SimpleRunner(RunnerBase):
else:
pipeline_result.status = "COMPLETED"
pipeline_result.scores = runtime_module.combine_folds(scores)
pipeline_result.outputs = [result.values for result in results]

self.request_results[request_id] = pipeline_result
return request_id


+ 8
- 12
examples/run_pipeline.py View File

@@ -1,11 +1,9 @@
import pandas as pd
import sys
import argparse

from searcher import schemas as schemas_utils
from searcher.utils import generate_dataset_problem, evaluate_pipeline
from axolotl.utils import pipeline as pipeline_utils
import os
import pandas as pd

from tods import generate_dataset, load_pipeline, evaluate_pipeline

this_path = os.path.dirname(os.path.abspath(__file__))
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
@@ -17,7 +15,7 @@ parser.add_argument('--target_index', type=int, default=6,
help='Index of the ground truth (for evaluation)')
parser.add_argument('--metric',type=str, default='F1_MACRO',
help='Evaluation Metric (F1, F1_MACRO)')
parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/searcher/resources/default_pipeline.json'),
parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/tods/resources/default_pipeline.json'),
help='Input the path of the pre-built pipeline description')

args = parser.parse_args()
@@ -27,16 +25,14 @@ target_index = args.target_index # what column is the target
pipeline_path = args.pipeline_path
metric = args.metric # F1 on both label 0 and 1

time_limit = 30 # How many seconds you wanna search

# Read data and generate dataset and problem
# Read data and generate dataset
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
dataset = generate_dataset(df, target_index)

# Load the default pipeline
pipeline = pipeline_utils.load_pipeline(pipeline_path)
pipeline = load_pipeline(pipeline_path)

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
pipeline_result = evaluate_pipeline(dataset, pipeline, metric)
print(pipeline_result)


examples/run_automl.py → examples/run_search.py View File

@@ -2,8 +2,8 @@ import pandas as pd

from axolotl.backend.simple import SimpleRunner

from searcher.utils import generate_dataset_problem
from searcher.search import BruteForceSearch
from tods import generate_dataset, generate_problem
from tods.searcher import BruteForceSearch

# Some information
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset
@@ -19,13 +19,15 @@ metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
dataset = generate_dataset(df, target_index=target_index)
problem_description = generate_problem(dataset, metric)

# Start backend
backend = SimpleRunner(random_seed=42)
backend = SimpleRunner(random_seed=0)

# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)
search = BruteForceSearch(problem_description=problem_description,
backend=backend)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)

+ 0
- 0
tods/searcher/__init__.py View File


+ 0
- 51
tods/searcher/utils.py View File

@@ -1,51 +0,0 @@

def generate_dataset_problem(df, target_index, metric):
    """Wrap dataset and problem-description generation into one call.

    Args:
        df (pandas.DataFrame): Raw tabular data.
        target_index (int): Column index of the prediction target.
        metric (str): 'F1' scores label 1 only; 'F1_MACRO' is macro-F1
            over labels 0 and 1.

    Returns:
        A (dataset, problem_description) pair.

    Raises:
        ValueError: If `metric` is not a supported name.
    """
    # Imports are deferred so that merely importing this module does not
    # require the heavy axolotl/d3m dependencies.
    from axolotl.utils import data_problem
    from d3m.metadata.problem import TaskKeyword, PerformanceMetric

    if metric == 'F1':
        perf_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
    elif metric == 'F1_MACRO':
        perf_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
    else:
        raise ValueError('The metric {} not supported.'.format(metric))

    dataset, problem_description = data_problem.generate_dataset_problem(
        df,
        target_index=target_index,
        task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
        performance_metrics=perf_metrics,
    )
    return dataset, problem_description

def evaluate_pipeline(problem_description, dataset, pipeline):
    """Evaluate a single pipeline against a dataset with a no-split run.

    Args:
        problem_description: Problem description whose
            `['problem']['performance_metrics']` entry supplies the metrics.
        dataset: The dataset to evaluate on.
        pipeline: The pipeline to score.

    Returns:
        The backend's pipeline evaluation result.
    """
    # Deferred imports keep module import cheap.
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner

    splitting = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring = schemas_utils.get_scoring_pipeline()
    prep_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

    runner = SimpleRunner(random_seed=0)
    return runner.evaluate_pipeline(
        problem_description=problem_description,
        pipeline=pipeline,
        input_data=[dataset],
        metrics=problem_description['problem']['performance_metrics'],
        data_preparation_pipeline=splitting,
        scoring_pipeline=scoring,
        data_preparation_params=prep_params,
    )



+ 1
- 0
tods/tods/__init__.py View File

@@ -0,0 +1 @@
from .utils import *

tods/searcher/resources/default_pipeline.json → tods/tods/resources/default_pipeline.json View File


tods/searcher/schemas.py → tods/tods/schemas.py View File


tods/searcher/search/__init__.py → tods/tods/searcher/__init__.py View File


tods/searcher/search/brute_force_search.py → tods/tods/searcher/brute_force_search.py View File


tods/searcher/tods/utils.py → tods/tods/utils.py View File

@@ -1,44 +1,82 @@

def generate_dataset_problem(df, target_index, metric):
def load_pipeline(pipeline_path):
    """Load a pipeline description from a file path.

    Args:
        pipeline_path (str): Path to a pipeline description file.

    Returns:
        The loaded pipeline object.
    """
    # Deferred import: avoid requiring axolotl at module import time.
    from axolotl.utils import pipeline as pipeline_utils
    return pipeline_utils.load_pipeline(pipeline_path)
def generate_dataset(df, target_index):
    """Build a dataset object from a dataframe.

    Args:
        df (pandas.DataFrame): Raw tabular data.
        target_index (int): Column index of the prediction target.

    Returns:
        The imported dataset.
    """
    # Deferred import: avoid requiring axolotl at module import time.
    from axolotl.utils import data_problem
    return data_problem.import_input_data(df, target_index=target_index)

def generate_problem(dataset, metric):
    """Build an anomaly-detection problem description for a dataset.

    Args:
        dataset: A dataset, e.g. one produced by `generate_dataset`.
        metric (str): 'F1' scores label 1 only; 'F1_MACRO' is macro-F1
            over labels 0 and 1.

    Returns:
        problem_description

    Raises:
        ValueError: If `metric` is not a supported name.
    """
    # Deferred imports keep module import cheap.
    from axolotl.utils import data_problem
    from d3m.metadata.problem import TaskKeyword, PerformanceMetric

    if metric == 'F1':
        perf_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
    elif metric == 'F1_MACRO':
        perf_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
    else:
        raise ValueError('The metric {} not supported.'.format(metric))

    return data_problem.generate_problem_description(
        dataset=dataset,
        task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
        performance_metrics=perf_metrics,
    )

def evaluate_pipeline(dataset, pipeline, metric='F1', seed=0):
    """Evaluate a pipeline on a dataset with a no-split run.

    Args:
        dataset: A dataset, e.g. one produced by `generate_dataset`.
        pipeline: The pipeline to score.
        metric (str): 'F1' scores label 1 only; 'F1_MACRO' is macro-F1
            over labels 0 and 1.
        seed (int): Random seed forwarded to the evaluation backend.

    Returns:
        pipeline_result: The backend's evaluation result. Any errors
        recorded on the result are printed (best-effort) but the result
        is still returned to the caller.
    """
    # Deferred imports keep module import cheap.
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner

    problem_description = generate_problem(dataset, metric)
    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
    metrics = problem_description['problem']['performance_metrics']

    backend = SimpleRunner(random_seed=seed)
    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                pipeline=pipeline,
                                                input_data=[dataset],
                                                metrics=metrics,
                                                data_preparation_pipeline=data_preparation_pipeline,
                                                scoring_pipeline=scoring_pipeline,
                                                data_preparation_params=data_preparation_params)

    # Surface (but do not propagate) errors recorded on the result.
    # Fix: the original used a bare `except:`, which also swallows
    # KeyboardInterrupt/SystemExit; catch Exception only.
    try:
        for error in pipeline_result.error:
            if error is not None:
                raise error
    except Exception:
        import traceback
        traceback.print_exc()

    return pipeline_result



Loading…
Cancel
Save