Browse Source

Simplify dataset interface and return results in evaluation

Former-commit-id: 7f5f4fdeff [formerly d1fbab262d] [formerly 0daaf199a4 [formerly ce0b30a55c]] [formerly 269ea6cef2 [formerly 4a0c281aaa] [formerly 5f0ee1311f [formerly 1f19e41f6e]]] [formerly 47b61676cf [formerly d170f442b4] [formerly 2d52007adf [formerly 4c76595f5d]] [formerly 3a1686558c [formerly 7d8c5f00d0] [formerly 13952f711d [formerly 40935732ed]]]] [formerly ee839034e0 [formerly a3eb29062c] [formerly 7a19be0e9b [formerly 3ad8bb47d6]] [formerly 4e64c258ed [formerly 7cab05b942] [formerly c01c4b5ab8 [formerly 8a0ffa31f8]]] [formerly e5c827f3b0 [formerly 66f6a5e882] [formerly 917b218a72 [formerly 259a497af0]] [formerly baf9af505d [formerly 0c3a6024c3] [formerly 8b317a7990 [formerly 92f001166b]]]]] [formerly 81cd5c0041 [formerly d1803c1a92] [formerly 770cfb90a2 [formerly 4ecb459345]] [formerly 11a8644180 [formerly 60943bca18] [formerly fea29803bf [formerly 42056da4f8]]] [formerly cd5cde3f12 [formerly 7e14151b0b] [formerly 9efc26abe9 [formerly 583fc82206]] [formerly 311b7db466 [formerly b73ec9c9d6] [formerly 73fcc752f6 [formerly ea6b562689]]]] [formerly 9870a026c7 [formerly d9dd3cb7ef] [formerly 58769d6ac6 [formerly 0c0373be5a]] [formerly 3c62af5a8f [formerly 5847362015] [formerly e284c3bc1b [formerly 60b26ababc]]] [formerly cd9bbd04fa [formerly 80477de5e8] [formerly 2d502d41f9 [formerly 054acf2933]] [formerly b25127cc27 [formerly 13fdd7fbe0] [formerly 3e00ce70c2 [formerly a8af4837df]]]]]]
Former-commit-id: b9cc8adf37 [formerly c5309a28ec] [formerly 8c5684d730 [formerly 4c3967fef1]] [formerly 0297be1cec [formerly c0a2675486] [formerly 9be3285b4b [formerly db084eb733]]] [formerly e2d2b8030d [formerly 4700ffd433] [formerly 69ff9f8352 [formerly 988e4501f3]] [formerly a63177b969 [formerly 012633f283] [formerly 6cfdeec2e4 [formerly fa85a63cb9]]]] [formerly 14218999e9 [formerly aebd4fd7b7] [formerly 7a94bac70d [formerly 38525ca0f6]] [formerly 9a5f056bb6 [formerly 36e0e78322] [formerly 956e79a19e [formerly 40bb07c680]]] [formerly f647735a81 [formerly 3a569a5afd] [formerly 3845b905c8 [formerly 6ac7882a40]] [formerly f3d0673d6d [formerly 1adec1f3c0] [formerly 3e00ce70c2]]]]
Former-commit-id: 9f083d9f48 [formerly 818a4e0422] [formerly e15a115ebc [formerly 7a5047d276]] [formerly 35f5b04bfe [formerly 391036b1fb] [formerly 2b02d5630e [formerly c8c40b8492]]] [formerly e59a237189 [formerly 7991a3a56e] [formerly 4e2fec0524 [formerly 93c110b8b0]] [formerly dc8c08c019 [formerly e6fa4eaa57] [formerly 98a89c9286 [formerly 86ee4f5d85]]]]
Former-commit-id: 73c340fb72 [formerly 9ea3e02483] [formerly 80681084f8 [formerly 221e36522f]] [formerly 99e0bc9871 [formerly b9937a5c3e] [formerly aa68f88105 [formerly 9575da9fc6]]]
Former-commit-id: dce6a3eae7 [formerly 386f12a53c] [formerly 7102bc3d49 [formerly e58449ea19]]
Former-commit-id: 873278d771 [formerly 9291014ea6]
Former-commit-id: cd3cf65740
master
Daochen Zha 5 years ago
parent
commit
56fb66b20a
11 changed files with 68 additions and 89 deletions
  1. +1
    -0
      axolotl/axolotl/backend/simple.py
  2. +8
    -12
      examples/run_pipeline.py
  3. +7
    -5
      examples/run_search.py
  4. +0
    -0
      tods/searcher/__init__.py
  5. +0
    -51
      tods/searcher/utils.py
  6. +1
    -0
      tods/tods/__init__.py
  7. +0
    -0
      tods/tods/resources/default_pipeline.json
  8. +0
    -0
      tods/tods/schemas.py
  9. +0
    -0
      tods/tods/searcher/__init__.py
  10. +0
    -0
      tods/tods/searcher/brute_force_search.py
  11. +51
    -21
      tods/tods/utils.py

+ 1
- 0
axolotl/axolotl/backend/simple.py View File

@@ -172,6 +172,7 @@ class SimpleRunner(RunnerBase):
else:
pipeline_result.status = "COMPLETED"
pipeline_result.scores = runtime_module.combine_folds(scores)
pipeline_result.outputs = [result.values for result in results]

self.request_results[request_id] = pipeline_result
return request_id


+ 8
- 12
examples/run_pipeline.py View File

@@ -1,11 +1,9 @@
import pandas as pd
import sys
import argparse

from searcher import schemas as schemas_utils
from searcher.utils import generate_dataset_problem, evaluate_pipeline
from axolotl.utils import pipeline as pipeline_utils
import os
import pandas as pd

from tods import generate_dataset, load_pipeline, evaluate_pipeline

this_path = os.path.dirname(os.path.abspath(__file__))
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
@@ -17,7 +15,7 @@ parser.add_argument('--target_index', type=int, default=6,
help='Index of the ground truth (for evaluation)')
parser.add_argument('--metric',type=str, default='F1_MACRO',
help='Evaluation Metric (F1, F1_MACRO)')
parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/searcher/resources/default_pipeline.json'),
parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/tods/resources/default_pipeline.json'),
help='Input the path of the pre-built pipeline description')

args = parser.parse_args()
@@ -27,16 +25,14 @@ target_index = args.target_index # what column is the target
pipeline_path = args.pipeline_path
metric = args.metric # F1 on both label 0 and 1

time_limit = 30 # How many seconds you wanna search

# Read data and generate dataset and problem
# Read data and generate dataset
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
dataset = generate_dataset(df, target_index)

# Load the default pipeline
pipeline = pipeline_utils.load_pipeline(pipeline_path)
pipeline = load_pipeline(pipeline_path)

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
pipeline_result = evaluate_pipeline(dataset, pipeline, metric)
print(pipeline_result)


examples/run_automl.py → examples/run_search.py View File

@@ -2,8 +2,8 @@ import pandas as pd

from axolotl.backend.simple import SimpleRunner

from searcher.utils import generate_dataset_problem
from searcher.search import BruteForceSearch
from tods import generate_dataset, generate_problem
from tods.searcher import BruteForceSearch

# Some information
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset
@@ -19,13 +19,15 @@ metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
dataset = generate_dataset(df, target_index=target_index)
problem_description = generate_problem(dataset, metric)

# Start backend
backend = SimpleRunner(random_seed=42)
backend = SimpleRunner(random_seed=0)

# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)
search = BruteForceSearch(problem_description=problem_description,
backend=backend)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)

+ 0
- 0
tods/searcher/__init__.py View File


+ 0
- 51
tods/searcher/utils.py View File

@@ -1,51 +0,0 @@

def generate_dataset_problem(df, target_index, metric):
    """Wrap dataset and problem-description generation into one call.

    Args:
        df (pandas.DataFrame): Raw tabular data.
        target_index (int): Column index of the prediction target.
        metric (str): 'F1' scores label 1 only; 'F1_MACRO' is macro-F1
            over labels 0 and 1.

    Returns:
        A (dataset, problem_description) pair.

    Raises:
        ValueError: If `metric` is not a supported name.
    """
    # Imports are deferred so that merely importing this module does not
    # require the heavy axolotl/d3m dependencies.
    from axolotl.utils import data_problem
    from d3m.metadata.problem import TaskKeyword, PerformanceMetric

    if metric == 'F1':
        perf_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
    elif metric == 'F1_MACRO':
        perf_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
    else:
        raise ValueError('The metric {} not supported.'.format(metric))

    dataset, problem_description = data_problem.generate_dataset_problem(
        df,
        target_index=target_index,
        task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
        performance_metrics=perf_metrics,
    )
    return dataset, problem_description

def evaluate_pipeline(problem_description, dataset, pipeline):
    """Evaluate a single pipeline against a dataset with a no-split run.

    Args:
        problem_description: Problem description whose
            `['problem']['performance_metrics']` entry supplies the metrics.
        dataset: The dataset to evaluate on.
        pipeline: The pipeline to score.

    Returns:
        The backend's pipeline evaluation result.
    """
    # Deferred imports keep module import cheap.
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner

    splitting = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring = schemas_utils.get_scoring_pipeline()
    prep_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

    runner = SimpleRunner(random_seed=0)
    return runner.evaluate_pipeline(
        problem_description=problem_description,
        pipeline=pipeline,
        input_data=[dataset],
        metrics=problem_description['problem']['performance_metrics'],
        data_preparation_pipeline=splitting,
        scoring_pipeline=scoring,
        data_preparation_params=prep_params,
    )



+ 1
- 0
tods/tods/__init__.py View File

@@ -0,0 +1 @@
from .utils import *

tods/searcher/resources/default_pipeline.json → tods/tods/resources/default_pipeline.json View File


tods/searcher/schemas.py → tods/tods/schemas.py View File


tods/searcher/search/__init__.py → tods/tods/searcher/__init__.py View File


tods/searcher/search/brute_force_search.py → tods/tods/searcher/brute_force_search.py View File


tods/searcher/tods/utils.py → tods/tods/utils.py View File

@@ -1,44 +1,82 @@

def generate_dataset_problem(df, target_index, metric):
def load_pipeline(pipeline_path):
    """Load a pipeline description from a file path.

    Args:
        pipeline_path (str): Path to a pipeline description file.

    Returns:
        The loaded pipeline object.
    """
    # Deferred import: avoid requiring axolotl at module import time.
    from axolotl.utils import pipeline as pipeline_utils
    return pipeline_utils.load_pipeline(pipeline_path)
def generate_dataset(df, target_index):
    """Build a dataset object from a dataframe.

    Args:
        df (pandas.DataFrame): Raw tabular data.
        target_index (int): Column index of the prediction target.

    Returns:
        The imported dataset.
    """
    # Deferred import: avoid requiring axolotl at module import time.
    from axolotl.utils import data_problem
    return data_problem.import_input_data(df, target_index=target_index)

def generate_problem(dataset, metric):
    """Build an anomaly-detection problem description for a dataset.

    Args:
        dataset: A dataset, e.g. one produced by `generate_dataset`.
        metric (str): 'F1' scores label 1 only; 'F1_MACRO' is macro-F1
            over labels 0 and 1.

    Returns:
        problem_description

    Raises:
        ValueError: If `metric` is not a supported name.
    """
    # Deferred imports keep module import cheap.
    from axolotl.utils import data_problem
    from d3m.metadata.problem import TaskKeyword, PerformanceMetric

    if metric == 'F1':
        perf_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
    elif metric == 'F1_MACRO':
        perf_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
    else:
        raise ValueError('The metric {} not supported.'.format(metric))

    return data_problem.generate_problem_description(
        dataset=dataset,
        task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
        performance_metrics=perf_metrics,
    )

def evaluate_pipeline(dataset, pipeline, metric='F1', seed=0):
    """Evaluate a pipeline on a dataset with a no-split run.

    Args:
        dataset: A dataset, e.g. one produced by `generate_dataset`.
        pipeline: The pipeline to score.
        metric (str): 'F1' scores label 1 only; 'F1_MACRO' is macro-F1
            over labels 0 and 1.
        seed (int): Random seed forwarded to the evaluation backend.

    Returns:
        pipeline_result: The backend's evaluation result. Any errors
        recorded on the result are printed (best-effort) but the result
        is still returned to the caller.
    """
    # Deferred imports keep module import cheap.
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner

    problem_description = generate_problem(dataset, metric)
    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
    metrics = problem_description['problem']['performance_metrics']

    backend = SimpleRunner(random_seed=seed)
    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                pipeline=pipeline,
                                                input_data=[dataset],
                                                metrics=metrics,
                                                data_preparation_pipeline=data_preparation_pipeline,
                                                scoring_pipeline=scoring_pipeline,
                                                data_preparation_params=data_preparation_params)

    # Surface (but do not propagate) errors recorded on the result.
    # Fix: the original used a bare `except:`, which also swallows
    # KeyboardInterrupt/SystemExit; catch Exception only.
    try:
        for error in pipeline_result.error:
            if error is not None:
                raise error
    except Exception:
        import traceback
        traceback.print_exc()

    return pipeline_result



Loading…
Cancel
Save