|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680 |
- import contextlib
- import json
- import gzip
- import io
- import logging
- import os.path
- import pickle
- import random
- import shutil
- import sys
- import tempfile
- import traceback
- import unittest
-
- import pandas
-
- COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives')
- # NOTE: This insertion should appear before any code attempting to resolve or load primitives,
- # so the git submodule version of `common-primitives` is looked at first.
- sys.path.insert(0, COMMON_PRIMITIVES_DIR)
-
- TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives')
- sys.path.insert(0, TEST_PRIMITIVES_DIR)
-
- from common_primitives.column_parser import ColumnParserPrimitive
- from common_primitives.construct_predictions import ConstructPredictionsPrimitive
- from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive
- from common_primitives.no_split import NoSplitDatasetSplitPrimitive
- from common_primitives.random_forest import RandomForestClassifierPrimitive
- from common_primitives.train_score_split import TrainScoreDatasetSplitPrimitive
-
-
- from test_primitives.random_classifier import RandomClassifierPrimitive
- from test_primitives.fake_score import FakeScorePrimitive
-
- from d3m import cli, index, runtime, utils
- from d3m.container import dataset as dataset_module
- from d3m.contrib.primitives.compute_scores import ComputeScoresPrimitive
- from d3m.metadata import base as metadata_base, pipeline as pipeline_module, pipeline_run as pipeline_run_module, problem as problem_module
-
- TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
- PROBLEM_DIR = os.path.join(TEST_DATA_DIR, 'problems')
- DATASET_DIR = os.path.join(TEST_DATA_DIR, 'datasets')
- PIPELINE_DIR = os.path.join(TEST_DATA_DIR, 'pipelines')
-
-
- class TestCLIRuntime(unittest.TestCase):
- def setUp(self):
- self.test_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- shutil.rmtree(self.test_dir)
-
- @classmethod
- def setUpClass(cls):
- to_register = {
- 'd3m.primitives.data_transformation.dataset_to_dataframe.Common': DatasetToDataFramePrimitive,
- 'd3m.primitives.classification.random_forest.Common': RandomForestClassifierPrimitive,
- 'd3m.primitives.classification.random_classifier.Test': RandomClassifierPrimitive,
- 'd3m.primitives.data_transformation.column_parser.Common': ColumnParserPrimitive,
- 'd3m.primitives.data_transformation.construct_predictions.Common': ConstructPredictionsPrimitive,
- 'd3m.primitives.evaluation.no_split_dataset_split.Common': NoSplitDatasetSplitPrimitive,
- 'd3m.primitives.evaluation.compute_scores.Test': FakeScorePrimitive,
- 'd3m.primitives.evaluation.train_score_dataset_split.Common': TrainScoreDatasetSplitPrimitive,
- # We do not have to load this primitive, but loading it here prevents the package from loading all primitives.
- 'd3m.primitives.evaluation.compute_scores.Core': ComputeScoresPrimitive,
- }
-
- # To hide any logging or stdout output.
- with utils.silence():
- for python_path, primitive in to_register.items():
- index.register_primitive(python_path, primitive)
-
- def _call_cli_runtime(self, arg):
- logger = logging.getLogger('d3m.runtime')
- with utils.silence():
- with self.assertLogs(logger=logger) as cm:
- # So that at least one message is logged.
- logger.warning("Debugging.")
- cli.main(arg)
- # We skip our "debugging" message.
- return cm.records[1:]
-
- def _call_cli_runtime_without_fail(self, arg):
- try:
- return self._call_cli_runtime(arg)
- except Exception as e:
- self.fail(traceback.format_exc())
-
- def _assert_valid_saved_pipeline_runs(self, pipeline_run_save_path):
- with open(pipeline_run_save_path, 'r') as f:
- for pipeline_run_dict in list(utils.yaml_load_all(f)):
- try:
- pipeline_run_module.validate_pipeline_run(pipeline_run_dict)
- except Exception as e:
- self.fail(traceback.format_exc())
-
- def _validate_previous_pipeline_run_ids(self, pipeline_run_save_path):
- ids = set()
- prev_ids = set()
- with open(pipeline_run_save_path, 'r') as f:
- for pipeline_run_dict in list(utils.yaml_load_all(f)):
- ids.add(pipeline_run_dict['id'])
- if 'previous_pipeline_run' in pipeline_run_dict:
- prev_ids.add(pipeline_run_dict['previous_pipeline_run']['id'])
- self.assertTrue(
- prev_ids.issubset(ids),
- 'Some previous pipeline run ids {} are not in the set of pipeline run ids {}'.format(prev_ids, ids)
- )
-
- def test_fit_multi_input(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- arg = [
- '',
- 'runtime',
- 'fit',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--problem',
- os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'multi-input-test.json'),
- '--expose-produced-outputs',
- self.test_dir,
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self._assert_standard_output_metadata()
-
- def test_fit_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline')
- output_csv_path = os.path.join(self.test_dir, 'output.csv')
- arg = [
- '',
- 'runtime',
- 'fit',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'multi-input-test.json'),
- '--save',
- fitted_pipeline_path,
- '--expose-produced-outputs',
- self.test_dir,
- '--output',
- output_csv_path,
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(utils.list_files(self.test_dir), [
- 'fitted-pipeline',
- 'output.csv',
- 'outputs.0/data.csv',
- 'outputs.0/metadata.json',
- 'pipeline_run.yml',
- 'steps.0.produce/data.csv',
- 'steps.0.produce/metadata.json',
- 'steps.1.produce/data.csv',
- 'steps.1.produce/metadata.json',
- 'steps.2.produce/data.csv',
- 'steps.2.produce/metadata.json'
- ])
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self._assert_standard_output_metadata()
- self._assert_prediction_sum(prediction_sum=11225, outputs_path='outputs.0/data.csv')
- self._assert_prediction_sum(prediction_sum=11225, outputs_path='output.csv')
-
- def test_produce_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-no-problem-pipeline')
- output_csv_path = os.path.join(self.test_dir, 'output.csv')
- arg = [
- '',
- 'runtime',
- 'fit',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'multi-input-test.json'),
- '--save',
- fitted_pipeline_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- arg = [
- '',
- 'runtime',
- 'produce',
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--output',
- output_csv_path,
- '--fitted-pipeline',
- fitted_pipeline_path,
- '--expose-produced-outputs',
- self.test_dir,
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(utils.list_files(self.test_dir), [
- 'fitted-no-problem-pipeline',
- 'output.csv',
- 'outputs.0/data.csv',
- 'outputs.0/metadata.json',
- 'pipeline_run.yml',
- 'steps.0.produce/data.csv',
- 'steps.0.produce/metadata.json',
- 'steps.1.produce/data.csv',
- 'steps.1.produce/metadata.json',
- 'steps.2.produce/data.csv',
- 'steps.2.produce/metadata.json'
- ])
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self._assert_standard_output_metadata()
- self._assert_prediction_sum(prediction_sum=11008, outputs_path='outputs.0/data.csv')
- self._assert_prediction_sum(prediction_sum=11008, outputs_path='output.csv')
-
- def test_fit_produce_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- output_csv_path = os.path.join(self.test_dir, 'output.csv')
- arg = [
- '',
- 'runtime',
- 'fit-produce',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'multi-input-test.json'),
- '--output',
- output_csv_path,
- '--expose-produced-outputs',
- self.test_dir,
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(utils.list_files(self.test_dir), [
- 'output.csv',
- 'outputs.0/data.csv',
- 'outputs.0/metadata.json',
- 'pipeline_run.yml',
- 'steps.0.produce/data.csv',
- 'steps.0.produce/metadata.json',
- 'steps.1.produce/data.csv',
- 'steps.1.produce/metadata.json',
- 'steps.2.produce/data.csv',
- 'steps.2.produce/metadata.json'
- ])
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
- self._validate_previous_pipeline_run_ids(pipeline_run_save_path)
- self._assert_standard_output_metadata()
- self._assert_prediction_sum(prediction_sum=11008, outputs_path='outputs.0/data.csv')
- self._assert_prediction_sum(prediction_sum=11008, outputs_path='output.csv')
-
- def test_nonstandard_fit_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline')
- arg = [
- '',
- 'runtime',
- 'fit',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'semi-standard-pipeline.json'),
- '--save',
- fitted_pipeline_path,
- '--expose-produced-outputs',
- self.test_dir,
- '--not-standard-pipeline',
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(utils.list_files(self.test_dir), [
- 'fitted-pipeline',
- 'outputs.0/data.csv',
- 'outputs.0/metadata.json',
- 'outputs.1/data.csv',
- 'outputs.1/metadata.json',
- 'pipeline_run.yml',
- 'steps.0.produce/data.csv',
- 'steps.0.produce/metadata.json',
- 'steps.1.produce/data.csv',
- 'steps.1.produce/metadata.json',
- ])
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self._assert_standard_output_metadata()
- self._assert_prediction_sum(prediction_sum=10710, outputs_path='outputs.0/data.csv')
- self._assert_nonstandard_output(outputs_name='outputs.1')
-
- def test_nonstandard_produce_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline')
- arg = [
- '',
- 'runtime',
- 'fit',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'semi-standard-pipeline.json'),
- '--save',
- fitted_pipeline_path,
- '--not-standard-pipeline'
- ]
- self._call_cli_runtime_without_fail(arg)
-
- arg = [
- '',
- 'runtime',
- 'produce',
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--fitted-pipeline',
- fitted_pipeline_path,
- '--expose-produced-outputs',
- self.test_dir,
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(utils.list_files(self.test_dir), [
- 'fitted-pipeline',
- 'outputs.0/data.csv',
- 'outputs.0/metadata.json',
- 'outputs.1/data.csv',
- 'outputs.1/metadata.json',
- 'pipeline_run.yml',
- 'steps.0.produce/data.csv',
- 'steps.0.produce/metadata.json',
- 'steps.1.produce/data.csv',
- 'steps.1.produce/metadata.json'
- ])
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self._assert_standard_output_metadata()
- self._assert_prediction_sum(prediction_sum=12106, outputs_path='outputs.0/data.csv')
- self._assert_nonstandard_output(outputs_name='outputs.1')
-
- def test_nonstandard_fit_produce_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- arg = [
- '',
- 'runtime',
- 'fit-produce',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'semi-standard-pipeline.json'),
- '--expose-produced-outputs',
- self.test_dir,
- '--not-standard-pipeline',
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(utils.list_files(self.test_dir), [
- 'outputs.0/data.csv',
- 'outputs.0/metadata.json',
- 'outputs.1/data.csv',
- 'outputs.1/metadata.json',
- 'pipeline_run.yml',
- 'steps.0.produce/data.csv',
- 'steps.0.produce/metadata.json',
- 'steps.1.produce/data.csv',
- 'steps.1.produce/metadata.json',
- ])
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
- self._validate_previous_pipeline_run_ids(pipeline_run_save_path)
- self._assert_standard_output_metadata()
- self._assert_prediction_sum(prediction_sum=12106, outputs_path='outputs.0/data.csv')
- self._assert_nonstandard_output(outputs_name='outputs.1')
-
- def test_fit_produce_multi_input(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- arg = [
- '',
- 'runtime',
- 'fit-produce',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--problem',
- os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'multi-input-test.json'),
- '--expose-produced-outputs',
- self.test_dir,
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(utils.list_files(self.test_dir), [
- 'outputs.0/data.csv',
- 'outputs.0/metadata.json',
- 'pipeline_run.yml',
- 'steps.0.produce/data.csv',
- 'steps.0.produce/metadata.json',
- 'steps.1.produce/data.csv',
- 'steps.1.produce/metadata.json',
- 'steps.2.produce/data.csv',
- 'steps.2.produce/metadata.json',
- ])
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
- self._validate_previous_pipeline_run_ids(pipeline_run_save_path)
- self._assert_standard_output_metadata()
- self._assert_prediction_sum(prediction_sum=11008, outputs_path='outputs.0/data.csv')
-
- def test_fit_score(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- arg = [
- '',
- 'runtime',
- 'fit-score',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--problem',
- os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--score-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml'),
- '--scores',
- os.path.join(self.test_dir, 'scores.csv'),
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
- self._validate_previous_pipeline_run_ids(pipeline_run_save_path)
-
- dataframe = pandas.read_csv(os.path.join(self.test_dir, 'scores.csv'))
- self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed'])
- self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]])
-
- def test_fit_score_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- arg = [
- '',
- 'runtime',
- 'fit-score',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--score-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'random-classifier.yml'),
- '--scoring-pipeline',
- os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'),
- # this argument has no effect
- '--metric',
- 'F1_MACRO',
- '--metric',
- 'ACCURACY',
- '--scores',
- os.path.join(self.test_dir, 'scores.csv'),
- '-O',
- pipeline_run_save_path,
- ]
- logging_records = self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(len(logging_records), 1)
- self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s")
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
- self._validate_previous_pipeline_run_ids(pipeline_run_save_path)
-
- dataframe = pandas.read_csv(os.path.join(self.test_dir, 'scores.csv'))
- self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed'])
- self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]])
-
- @staticmethod
- def _get_iris_dataset_path():
- return os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json')
-
- @staticmethod
- def _get_iris_problem_path():
- return os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json')
-
- @staticmethod
- def _get_random_forest_pipeline_path():
- return os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml')
-
- @staticmethod
- def _get_no_split_data_pipeline_path():
- return os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml')
-
- @staticmethod
- def _get_train_test_split_data_pipeline_path():
- return os.path.join(PIPELINE_DIR, 'data-preparation-train-test-split.yml')
-
- def _get_pipeline_run_save_path(self):
- return os.path.join(self.test_dir, 'pipeline_run.yml')
-
- def _get_predictions_path(self):
- return os.path.join(self.test_dir, 'predictions.csv')
-
- def _get_scores_path(self):
- return os.path.join(self.test_dir, 'scores.csv')
-
- def _get_pipeline_rerun_save_path(self):
- return os.path.join(self.test_dir, 'pipeline_rerun.yml')
-
- def _get_rescores_path(self):
- return os.path.join(self.test_dir, 'rescores.csv')
-
- def _fit_iris_random_forest(
- self, *, predictions_path=None, fitted_pipeline_path=None, pipeline_run_save_path=None
- ):
- if pipeline_run_save_path is None:
- pipeline_run_save_path = self._get_pipeline_run_save_path()
- arg = [
- '',
- 'runtime',
- 'fit',
- '--input',
- self._get_iris_dataset_path(),
- '--problem',
- self._get_iris_problem_path(),
- '--pipeline',
- self._get_random_forest_pipeline_path(),
- '-O',
- pipeline_run_save_path
- ]
- if predictions_path is not None:
- arg.append('--output')
- arg.append(predictions_path)
- if fitted_pipeline_path is not None:
- arg.append('--save')
- arg.append(fitted_pipeline_path)
-
- self._call_cli_runtime_without_fail(arg)
-
- def _fit_iris_random_classifier_without_problem(self, *, fitted_pipeline_path):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- arg = [
- '',
- 'runtime',
- 'fit',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'random-classifier.yml'),
- '-O',
- pipeline_run_save_path
- ]
- if fitted_pipeline_path is not None:
- arg.append('--save')
- arg.append(fitted_pipeline_path)
-
- self._call_cli_runtime_without_fail(arg)
-
- def test_fit(self):
- pipeline_run_save_path = self._get_pipeline_run_save_path()
- fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline')
- self._fit_iris_random_forest(
- fitted_pipeline_path=fitted_pipeline_path, pipeline_run_save_path=pipeline_run_save_path
- )
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self.assertTrue(os.path.isfile(fitted_pipeline_path))
- self.assertTrue(os.path.isfile(pipeline_run_save_path))
-
- def test_evaluate(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- scores_path = os.path.join(self.test_dir, 'scores.csv')
- arg = [
- '',
- 'runtime',
- 'evaluate',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--problem',
- os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml'),
- '--data-pipeline',
- os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'),
- '--scores',
- scores_path,
- '--metric',
- 'ACCURACY',
- '--metric',
- 'F1_MACRO',
- '-O',
- pipeline_run_save_path
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
- self._validate_previous_pipeline_run_ids(pipeline_run_save_path)
-
- dataframe = pandas.read_csv(scores_path)
- self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold'])
- self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0], ['F1_MACRO', 1.0, 1.0, 0, 0]])
-
- def test_evaluate_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- scores_path = os.path.join(self.test_dir, 'scores.csv')
- arg = [
- '',
- 'runtime',
- 'evaluate',
- '--input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--pipeline',
- os.path.join(PIPELINE_DIR, 'random-classifier.yml'),
- '--data-pipeline',
- os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'),
- '--scoring-pipeline',
- os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'),
- # this argument has no effect
- '--metric',
- 'ACCURACY',
- '--scores',
- scores_path,
- '-O',
- pipeline_run_save_path
- ]
- logging_records = self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(len(logging_records), 1)
- self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s")
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
- self._validate_previous_pipeline_run_ids(pipeline_run_save_path)
-
- dataframe = pandas.read_csv(scores_path)
- self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold'])
- self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0]])
-
- def test_score(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline')
- self._fit_iris_random_forest(fitted_pipeline_path=fitted_pipeline_path)
- self.assertTrue(os.path.isfile(fitted_pipeline_path))
-
- scores_path = os.path.join(self.test_dir, 'scores.csv')
- arg = [
- '',
- 'runtime',
- 'score',
- '--fitted-pipeline',
- fitted_pipeline_path,
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--score-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--scores',
- scores_path,
- '--metric',
- 'F1_MACRO',
- '--metric',
- 'ACCURACY',
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self.assertTrue(os.path.isfile(scores_path), 'scores were not generated')
-
- dataframe = pandas.read_csv(scores_path)
-
- self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed'])
- self.assertEqual(dataframe.values.tolist(), [['F1_MACRO', 1.0, 1.0, 0], ['ACCURACY', 1.0, 1.0, 0]])
-
- def test_score_without_problem_without_metric(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline')
- self._fit_iris_random_classifier_without_problem(fitted_pipeline_path=fitted_pipeline_path)
- self.assertTrue(os.path.isfile(fitted_pipeline_path))
-
- scores_path = os.path.join(self.test_dir, 'scores.csv')
- arg = [
- '',
- 'runtime',
- 'score',
- '--fitted-pipeline',
- fitted_pipeline_path,
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--score-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--scoring-pipeline',
- os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'),
- '--scores',
- scores_path,
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self.assertTrue(os.path.isfile(scores_path), 'scores were not generated')
-
- dataframe = pandas.read_csv(scores_path)
-
- self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed'])
- self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]])
-
- def test_score_without_problem(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline')
- self._fit_iris_random_classifier_without_problem(fitted_pipeline_path=fitted_pipeline_path)
- self.assertTrue(os.path.isfile(fitted_pipeline_path))
-
- scores_path = os.path.join(self.test_dir, 'scores.csv')
- arg = [
- '',
- 'runtime',
- 'score',
- '--fitted-pipeline',
- fitted_pipeline_path,
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--score-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '--scoring-pipeline',
- os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'),
- # this argument has no effect
- '--metric',
- 'ACCURACY',
- '--scores',
- scores_path,
- '-O',
- pipeline_run_save_path,
- ]
- logging_records = self._call_cli_runtime_without_fail(arg)
-
- self.assertEqual(len(logging_records), 1)
- self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s")
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
- self.assertTrue(os.path.isfile(scores_path), 'scores were not generated')
-
- dataframe = pandas.read_csv(scores_path)
-
- self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed'])
- self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]])
-
- def test_produce(self):
- pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
- fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline')
- self._fit_iris_random_forest(fitted_pipeline_path=fitted_pipeline_path)
- self.assertTrue(os.path.isfile(fitted_pipeline_path))
-
- arg = [
- '',
- 'runtime',
- 'produce',
- '--fitted-pipeline',
- fitted_pipeline_path,
- '--test-input',
- os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
- '-O',
- pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
-
    def test_score_predictions(self):
        """Score an already-produced predictions file via 'runtime score-predictions'."""
        predictions_path = os.path.join(self.test_dir, 'predictions.csv')
        self._fit_iris_random_forest(predictions_path=predictions_path)
        self.assertTrue(os.path.isfile(predictions_path))

        scores_path = os.path.join(self.test_dir, 'scores.csv')
        arg = [
            '',
            'runtime',
            'score-predictions',
            '--score-input',
            os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'),
            '--problem',
            os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'),
            '--predictions',
            predictions_path,
            '--metric',
            'ACCURACY',
            '--metric',
            'F1_MACRO',
            '--scores',
            scores_path,
        ]
        self._call_cli_runtime_without_fail(arg)

        self.assertTrue(os.path.isfile(scores_path), 'scores were not generated')

        dataframe = pandas.read_csv(scores_path)

        # Unlike 'runtime score' (see test_score_without_problem), there is no 'randomSeed' column here.
        self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized'])
        self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0], ['F1_MACRO', 1.0, 1.0]])
-
    def test_sklearn_dataset_fit_produce(self):
        """Fit-produce a multi-input pipeline on the sklearn iris dataset with a problem doc."""
        self._create_sklearn_iris_problem_doc()

        pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
        arg = [
            '',
            'runtime',
            'fit-produce',
            # The multi-input pipeline consumes the same dataset twice, for both fit and produce.
            '--input',
            'sklearn://iris',
            '--input',
            'sklearn://iris',
            '--problem',
            os.path.join(self.test_dir, 'problemDoc.json'),
            '--test-input',
            'sklearn://iris',
            '--test-input',
            'sklearn://iris',
            '--pipeline',
            os.path.join(PIPELINE_DIR, 'multi-input-test.json'),
            '--expose-produced-outputs',
            self.test_dir,
            '-O',
            pipeline_run_save_path,
        ]
        self._call_cli_runtime_without_fail(arg)

        self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
        self._validate_previous_pipeline_run_ids(pipeline_run_save_path)

        # All exposed produce outputs must have been written to disk.
        self.assertEqual(utils.list_files(self.test_dir), [
            'outputs.0/data.csv',
            'outputs.0/metadata.json',
            'pipeline_run.yml',
            'problemDoc.json',
            'steps.0.produce/data.csv',
            'steps.0.produce/metadata.json',
            'steps.1.produce/data.csv',
            'steps.1.produce/metadata.json',
            'steps.2.produce/data.csv',
            'steps.2.produce/metadata.json'
        ])
        self._assert_standard_output_metadata(prediction_type='numpy.int64')
        self._assert_prediction_sum(prediction_sum=10648, outputs_path='outputs.0/data.csv')
-
    def test_sklearn_dataset_fit_produce_without_problem(self):
        """Fit-produce on the sklearn iris dataset without a problem description.

        Also exercises '--save' (fitted pipeline) and '--output' (predictions CSV).
        """
        output_csv_path = os.path.join(self.test_dir, 'output.csv')
        pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
        fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline')
        arg = [
            '',
            'runtime',
            'fit-produce',
            '--input',
            'sklearn://iris',
            '--test-input',
            'sklearn://iris',
            '--pipeline',
            os.path.join(PIPELINE_DIR, 'random-classifier.yml'),
            '--save',
            fitted_pipeline_path,
            '--output',
            output_csv_path,
            '--expose-produced-outputs',
            self.test_dir,
            '-O',
            pipeline_run_save_path,
        ]

        self._call_cli_runtime_without_fail(arg)

        self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
        self._validate_previous_pipeline_run_ids(pipeline_run_save_path)

        self.assertEqual(utils.list_files(self.test_dir), [
            'fitted-pipeline',
            'output.csv',
            'outputs.0/data.csv',
            'outputs.0/metadata.json',
            'pipeline_run.yml',
            'steps.0.produce/data.csv',
            'steps.0.produce/metadata.json',
            'steps.1.produce/data.csv',
            'steps.1.produce/metadata.json',
            'steps.2.produce/data.csv',
            'steps.2.produce/metadata.json',
        ])
        self._assert_standard_output_metadata(prediction_type='numpy.int64')
        # Predictions exposed via '--expose-produced-outputs' and via '--output' must agree.
        self._assert_prediction_sum(prediction_sum=10648, outputs_path='outputs.0/data.csv')
        self._assert_prediction_sum(prediction_sum=10648, outputs_path='output.csv')
-
- def _create_sklearn_iris_problem_doc(self):
- with open(os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'), 'r', encoding='utf8') as problem_doc_file:
- problem_doc = json.load(problem_doc_file)
-
- problem_doc['inputs']['data'][0]['datasetID'] = 'sklearn://iris'
-
- with open(os.path.join(self.test_dir, 'problemDoc.json'), 'x', encoding='utf8') as problem_doc_file:
- json.dump(problem_doc, problem_doc_file)
-
    def test_sklearn_dataset_evaluate(self):
        """Evaluate a pipeline on the sklearn iris dataset using a no-split data preparation pipeline."""
        self._create_sklearn_iris_problem_doc()

        pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
        scores_path = os.path.join(self.test_dir, 'scores.csv')
        arg = [
            '',
            'runtime',
            'evaluate',
            '--input',
            'sklearn://iris',
            '--problem',
            os.path.join(self.test_dir, 'problemDoc.json'),
            '--pipeline',
            os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml'),
            '--data-pipeline',
            os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'),
            '--scores',
            scores_path,
            '--metric',
            'ACCURACY',
            '--metric',
            'F1_MACRO',
            '-O',
            pipeline_run_save_path
        ]
        self._call_cli_runtime_without_fail(arg)

        self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
        self._validate_previous_pipeline_run_ids(pipeline_run_save_path)

        dataframe = pandas.read_csv(scores_path)
        # 'evaluate' adds a 'fold' column (a single fold 0 here) on top of the standard score columns.
        self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold'])
        self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0], ['F1_MACRO', 1.0, 1.0, 0, 0]])
-
    def test_sklearn_dataset_evaluate_without_problem(self):
        """Evaluate on the sklearn iris dataset without a problem description, with a custom scoring pipeline."""
        pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml')
        scores_path = os.path.join(self.test_dir, 'scores.csv')
        arg = [
            '',
            'runtime',
            'evaluate',
            '--input',
            'sklearn://iris',
            '--pipeline',
            os.path.join(PIPELINE_DIR, 'random-classifier.yml'),
            '--data-pipeline',
            os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'),
            '--scoring-pipeline',
            os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'),
            # this argument has no effect
            '--metric',
            'ACCURACY',
            '--scores',
            scores_path,
            '-O',
            pipeline_run_save_path
        ]
        logging_records = self._call_cli_runtime_without_fail(arg)

        # Exactly one warning: the custom scoring pipeline ignores the provided metric hyper-parameter.
        self.assertEqual(len(logging_records), 1)
        self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s")

        self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)
        self._validate_previous_pipeline_run_ids(pipeline_run_save_path)

        dataframe = pandas.read_csv(scores_path)
        self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold'])
        self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0]])
-
- def _assert_prediction_sum(self, prediction_sum, outputs_path):
- if prediction_sum is not None:
- with open(os.path.join(self.test_dir, outputs_path), 'r') as csv_file:
- self.assertEqual(sum([int(v) for v in list(csv_file)[1:]]), prediction_sum)
-
    def _assert_standard_output_metadata(self, outputs_name='outputs.0', prediction_type='str'):
        """Assert that an exposed output's metadata.json describes the standard
        150-row, single-column predictions DataFrame.

        :param outputs_name: directory (relative to ``test_dir``) holding the exposed output
        :param prediction_type: expected structural type of the predictions column
        """
        with open(os.path.join(self.test_dir, outputs_name, 'metadata.json'), 'r') as metadata_file:
            metadata = json.load(metadata_file)

        self.assertEqual(
            metadata,
            [
                {
                    "selector": [],
                    "metadata": {
                        "dimension": {
                            "length": 150,
                            "name": "rows",
                            "semantic_types": ["https://metadata.datadrivendiscovery.org/types/TabularRow"],
                        },
                        "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json",
                        "semantic_types": ["https://metadata.datadrivendiscovery.org/types/Table"],
                        "structural_type": "d3m.container.pandas.DataFrame",
                    },
                },
                {
                    "selector": ["__ALL_ELEMENTS__"],
                    "metadata": {
                        "dimension": {
                            "length": 1,
                            "name": "columns",
                            "semantic_types": ["https://metadata.datadrivendiscovery.org/types/TabularColumn"],
                        }
                    },
                },
                {"selector": ["__ALL_ELEMENTS__", 0],
                 "metadata": {"name": "predictions", "structural_type": prediction_type}},
            ],
        )
-
    def _assert_nonstandard_output(self, outputs_name='outputs.1'):
        """Assert that a non-predictions exposed output equals the raw iris learning
        data and that its metadata.json describes the full 6-column table.
        """
        # The exposed data must round-trip to exactly the original learningData.csv.
        with open(os.path.join(self.test_dir, outputs_name, 'data.csv'), 'r') as csv_file:
            output_dataframe = pandas.read_csv(csv_file, index_col=False)
            learning_dataframe = pandas.read_csv(
                os.path.join(DATASET_DIR, 'iris_dataset_1/tables/learningData.csv'), index_col=False)
            self.assertTrue(learning_dataframe.equals(output_dataframe))

        with open(os.path.join(self.test_dir, outputs_name, 'metadata.json'), 'r') as metadata_file:
            metadata = json.load(metadata_file)

        self.assertEqual(
            metadata,
            [
                {
                    "metadata": {
                        "dimension": {
                            "length": 150,
                            "name": "rows",
                            "semantic_types": [
                                "https://metadata.datadrivendiscovery.org/types/TabularRow"
                            ]
                        },
                        "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json",
                        "semantic_types": [
                            "https://metadata.datadrivendiscovery.org/types/Table"
                        ],
                        "structural_type": "d3m.container.pandas.DataFrame"
                    },
                    "selector": []
                },
                {
                    "metadata": {
                        "dimension": {
                            "length": 6,
                            "name": "columns",
                            "semantic_types": [
                                "https://metadata.datadrivendiscovery.org/types/TabularColumn"
                            ]
                        }
                    },
                    "selector": [
                        "__ALL_ELEMENTS__"
                    ]
                },
                {
                    "metadata": {
                        "name": "d3mIndex",
                        "semantic_types": [
                            "http://schema.org/Integer",
                            "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
                        ],
                        "structural_type": "str"
                    },
                    "selector": [
                        "__ALL_ELEMENTS__",
                        0
                    ]
                },
                {
                    "metadata": {
                        "name": "sepalLength",
                        "semantic_types": [
                            "http://schema.org/Float",
                            "https://metadata.datadrivendiscovery.org/types/Attribute"
                        ],
                        "structural_type": "str"
                    },
                    "selector": [
                        "__ALL_ELEMENTS__",
                        1
                    ]
                },
                {
                    "metadata": {
                        "name": "sepalWidth",
                        "semantic_types": [
                            "http://schema.org/Float",
                            "https://metadata.datadrivendiscovery.org/types/Attribute"
                        ],
                        "structural_type": "str"
                    },
                    "selector": [
                        "__ALL_ELEMENTS__",
                        2
                    ]
                },
                {
                    "metadata": {
                        "name": "petalLength",
                        "semantic_types": [
                            "http://schema.org/Float",
                            "https://metadata.datadrivendiscovery.org/types/Attribute"
                        ],
                        "structural_type": "str"
                    },
                    "selector": [
                        "__ALL_ELEMENTS__",
                        3
                    ]
                },
                {
                    "metadata": {
                        "name": "petalWidth",
                        "semantic_types": [
                            "http://schema.org/Float",
                            "https://metadata.datadrivendiscovery.org/types/Attribute"
                        ],
                        "structural_type": "str"
                    },
                    "selector": [
                        "__ALL_ELEMENTS__",
                        4
                    ]
                },
                {
                    "metadata": {
                        "name": "species",
                        "semantic_types": [
                            "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
                            "https://metadata.datadrivendiscovery.org/types/Attribute"
                        ],
                        "structural_type": "str"
                    },
                    "selector": [
                        "__ALL_ELEMENTS__",
                        5
                    ]
                }
            ]
        )
-
- def _assert_pipeline_runs_equal(self, pipeline_run_save_path1, pipeline_run_save_path2):
- with open(pipeline_run_save_path1, 'r') as f:
- pipeline_runs1 = list(utils.yaml_load_all(f))
-
- with open(pipeline_run_save_path2, 'r') as f:
- pipeline_runs2 = list(utils.yaml_load_all(f))
-
- self.assertEqual(len(pipeline_runs1), len(pipeline_runs2))
-
- for pipeline_run1, pipeline_run2 in zip(pipeline_runs1, pipeline_runs2):
- self.assertTrue(pipeline_run_module.PipelineRun.json_structure_equals(pipeline_run1, pipeline_run2))
-
- def test_pipeline_run_json_structure_equals(self):
- pipeline_run_save_path1 = os.path.join(self.test_dir, 'pipeline_run1.yml')
- self._fit_iris_random_forest(pipeline_run_save_path=pipeline_run_save_path1)
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path1)
-
- pipeline_run_save_path2 = os.path.join(self.test_dir, 'pipeline_run2.yml')
- self._fit_iris_random_forest(pipeline_run_save_path=pipeline_run_save_path2)
- self._assert_valid_saved_pipeline_runs(pipeline_run_save_path2)
-
- self._assert_pipeline_runs_equal(pipeline_run_save_path1, pipeline_run_save_path2)
-
- def _cache_pipeline_for_rerun(self, pipeline_path, cache_dir=None):
- """make pipeline searchable by id in test_dir"""
- with open(pipeline_path, 'r') as f:
- pipeline = utils.yaml_load(f)
- if cache_dir is None:
- cache_dir = self.test_dir
- temp_pipeline_path = os.path.join(cache_dir, pipeline['id'] + '.yml')
- with open(temp_pipeline_path, 'w') as f:
- utils.yaml_dump(pipeline, f)
-
- @staticmethod
- def _generate_seed():
- return random.randint(2**31, 2**32-1)
-
    def test_fit_rerun(self):
        """Refitting from a saved pipeline run ('--input-run') should reproduce the original fit run."""
        dataset_path = self._get_iris_dataset_path()
        problem_path = self._get_iris_problem_path()
        pipeline_path = self._get_random_forest_pipeline_path()
        pipeline_run_save_path = self._get_pipeline_run_save_path()

        problem = problem_module.get_problem(problem_path)
        inputs = [dataset_module.get_dataset(dataset_path)]
        with open(pipeline_path) as f:
            pipeline = pipeline_module.Pipeline.from_yaml(f)

        # Non-default hyper-parameters and a random seed give the rerun something non-trivial to reproduce.
        hyperparams = [{}, {}, {'n_estimators': 19}, {}]
        random_seed = self._generate_seed()

        with utils.silence():
            fitted_pipeline, predictions, fit_result = runtime.fit(
                pipeline, inputs, problem_description=problem, hyperparams=hyperparams,
                random_seed=random_seed, context=metadata_base.Context.TESTING,
            )

        with open(pipeline_run_save_path, 'w') as f:
            fit_result.pipeline_run.to_yaml(f)

        # The rerun resolves the pipeline by id, so it must be discoverable under test_dir.
        self._cache_pipeline_for_rerun(pipeline_path)

        pipeline_rerun_save_path = self._get_pipeline_rerun_save_path()

        rerun_arg = [
            '',
            '--pipelines-path',
            self.test_dir,
            'runtime',
            '--datasets',
            TEST_DATA_DIR,
            'fit',
            '--input-run',
            pipeline_run_save_path,
            '--output-run',
            pipeline_rerun_save_path,
        ]
        self._call_cli_runtime_without_fail(rerun_arg)

        self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path)
        self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path)
-
    def test_produce_rerun(self):
        """Re-running produce from a saved pipeline run should reproduce the original produce run."""
        dataset_path = self._get_iris_dataset_path()
        problem_path = self._get_iris_problem_path()
        pipeline_path = self._get_random_forest_pipeline_path()
        pipeline_run_save_path = self._get_pipeline_run_save_path()
        fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline')

        self._fit_iris_random_forest(fitted_pipeline_path=fitted_pipeline_path)
        self.assertTrue(os.path.isfile(fitted_pipeline_path))

        arg = [
            '',
            'runtime',
            'produce',
            '--fitted-pipeline',
            fitted_pipeline_path,
            '--test-input',
            dataset_path,
            '--output-run',
            pipeline_run_save_path,
        ]
        self._call_cli_runtime_without_fail(arg)
        self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)

        # The rerun resolves the pipeline by id, so it must be discoverable under test_dir.
        self._cache_pipeline_for_rerun(pipeline_path)

        pipeline_rerun_save_path = self._get_pipeline_rerun_save_path()

        rerun_arg = [
            '',
            '--pipelines-path',
            self.test_dir,
            'runtime',
            '--datasets',
            TEST_DATA_DIR,
            'produce',
            '--fitted-pipeline',
            fitted_pipeline_path,
            '--input-run',
            pipeline_run_save_path,
            '--output-run',
            pipeline_rerun_save_path,
        ]
        self._call_cli_runtime_without_fail(rerun_arg)
        self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path)

        self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path)
-
- def _assert_scores_equal(self, scores_path, rescores_path):
- scores = pandas.read_csv(scores_path)
- rescores = pandas.read_csv(rescores_path)
- self.assertTrue(scores.equals(rescores), '\n{}\n\n{}'.format(scores, rescores))
-
    def _assert_scores_equal_pipeline_run(self, scores_path, pipeline_run_save_path):
        """Assert that the scores CSV matches the scores recorded inside the last saved pipeline run."""
        scores = pandas.read_csv(scores_path)
        # 'fold' and 'randomSeed' are not stored per-score inside the pipeline run, so
        # drop them from the CSV side; the seed is checked separately at the end.
        scores.drop('fold', axis=1, inplace=True, errors='ignore')
        scores_no_seed = scores.drop('randomSeed', axis=1, errors='ignore')

        with open(pipeline_run_save_path) as f:
            # TODO: always use -1?
            pipeline_run = list(utils.yaml_load_all(f))[-1]

        self.assertEqual(pipeline_run['run']['phase'], metadata_base.PipelineRunPhase.PRODUCE.name)
        # TODO: clean up preprocessing?
        pipeline_run_scores_df = pandas.DataFrame(pipeline_run['run']['results']['scores'])
        # TODO: is it possible to make pipeline run schema more compatible with scores csv schema?
        # In the run document each 'metric' cell is a mapping; extract the metric name for comparison.
        pipeline_run_scores_df['metric'] = pipeline_run_scores_df['metric'].map(lambda cell: cell['metric'])
        # Align column order with the CSV before the frame comparison.
        pipeline_run_scores_df = pipeline_run_scores_df[scores_no_seed.columns.tolist()]

        pandas.testing.assert_frame_equal(scores_no_seed, pipeline_run_scores_df)
        self.assertEqual(scores['randomSeed'].iloc[0], pipeline_run['random_seed'])
-
    def test_score_rerun(self):
        """Re-running score from a saved pipeline run should reproduce the original scores.

        Scoring is done with 'add_normalized_scores' disabled and explicit random
        seeds, so the rerun must reproduce both the score values and the seed.
        """
        dataset_path = self._get_iris_dataset_path()
        problem_path = self._get_iris_problem_path()
        pipeline_path = self._get_random_forest_pipeline_path()
        pipeline_run_save_path = self._get_pipeline_run_save_path()
        fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline')
        scores_path = os.path.join(self.test_dir, 'scores.csv')

        random_seed = self._generate_seed()
        metrics = runtime.get_metrics_from_list(['ACCURACY', 'F1_MACRO'])
        scoring_params = {'add_normalized_scores': 'false'}
        scoring_random_seed = self._generate_seed()

        problem = problem_module.get_problem(problem_path)
        inputs = [dataset_module.get_dataset(dataset_path)]
        with open(pipeline_path) as f:
            pipeline = pipeline_module.Pipeline.from_yaml(f)
        with open(runtime.DEFAULT_SCORING_PIPELINE_PATH) as f:
            scoring_pipeline = pipeline_module.Pipeline.from_yaml(f)

        # Fit, produce, and score programmatically to obtain the reference pipeline run.
        with utils.silence():
            fitted_pipeline, predictions, fit_result = runtime.fit(
                pipeline, inputs, problem_description=problem, random_seed=random_seed,
                context=metadata_base.Context.TESTING,
            )
            with open(fitted_pipeline_path, 'wb') as f:
                pickle.dump(fitted_pipeline, f)

            predictions, produce_result = runtime.produce(fitted_pipeline, inputs)

            scores, score_result = runtime.score(
                predictions, inputs, scoring_pipeline=scoring_pipeline,
                problem_description=problem, metrics=metrics, predictions_random_seed=random_seed,
                context=metadata_base.Context.TESTING, scoring_params=scoring_params,
                random_seed=scoring_random_seed
            )

        self.assertFalse(score_result.has_error(), score_result.error)

        scores.to_csv(scores_path)

        # Attach the scoring run and scores to the produce pipeline run before saving.
        runtime.combine_pipeline_runs(
            produce_result.pipeline_run, scoring_pipeline_run=score_result.pipeline_run, score_inputs=inputs,
            metrics=metrics, scores=scores
        )
        with open(pipeline_run_save_path, 'w') as f:
            produce_result.pipeline_run.to_yaml(f)

        self.assertTrue(os.path.isfile(fitted_pipeline_path))
        self.assertTrue(os.path.isfile(scores_path), 'scores were not generated')
        self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)

        dataframe = pandas.read_csv(scores_path)

        # 'normalized' column is absent because add_normalized_scores was disabled.
        self.assertEqual(list(dataframe.columns), ['metric', 'value', 'randomSeed'])
        self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, random_seed], ['F1_MACRO', 1.0, random_seed]])

        self._cache_pipeline_for_rerun(pipeline_path)

        pipeline_rerun_save_path = self._get_pipeline_rerun_save_path()
        rescores_path = self._get_rescores_path()

        rerun_arg = [
            '',
            '--pipelines-path',
            self.test_dir,
            'runtime',
            '--datasets',
            TEST_DATA_DIR,
            'score',
            '--fitted-pipeline',
            fitted_pipeline_path,
            '--input-run',
            pipeline_run_save_path,
            '--output-run',
            pipeline_rerun_save_path,
            '--scores',
            rescores_path,
        ]
        self._call_cli_runtime_without_fail(rerun_arg)
        self.assertTrue(os.path.isfile(pipeline_rerun_save_path))
        self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path)
        self._assert_scores_equal(scores_path, rescores_path)
        self._assert_scores_equal_pipeline_run(scores_path, pipeline_rerun_save_path)
        self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path)
-
    def test_fit_produce_rerun(self):
        """Re-running fit-produce from a saved pair of pipeline runs should reproduce both runs."""
        dataset_path = self._get_iris_dataset_path()
        problem_path = self._get_iris_problem_path()
        pipeline_path = self._get_random_forest_pipeline_path()
        pipeline_run_save_path = self._get_pipeline_run_save_path()

        # Non-default hyper-parameters and a random seed give the rerun something non-trivial to reproduce.
        hyperparams = [{}, {}, {'n_estimators': 19}, {}]
        random_seed = self._generate_seed()

        problem = problem_module.get_problem(problem_path)
        inputs = [dataset_module.get_dataset(dataset_path)]
        with open(pipeline_path) as f:
            pipeline = pipeline_module.Pipeline.from_yaml(f)

        with utils.silence():
            fitted_pipeline, predictions, fit_result = runtime.fit(
                pipeline, inputs, problem_description=problem, hyperparams=hyperparams,
                random_seed=random_seed, context=metadata_base.Context.TESTING,
            )
            predictions, produce_result = runtime.produce(fitted_pipeline, inputs)

        # Save both runs into one file; the produce run is appended after the fit run.
        with open(pipeline_run_save_path, 'w') as f:
            fit_result.pipeline_run.to_yaml(f)
            produce_result.pipeline_run.to_yaml(f, appending=True)

        self._cache_pipeline_for_rerun(pipeline_path)

        pipeline_rerun_save_path = self._get_pipeline_rerun_save_path()

        rerun_arg = [
            '',
            '--pipelines-path',
            self.test_dir,
            '--strict-digest',
            'runtime',
            '--datasets',
            TEST_DATA_DIR,
            'fit-produce',
            '--input-run',
            pipeline_run_save_path,
            '--output-run',
            pipeline_rerun_save_path,
        ]
        self._call_cli_runtime_without_fail(rerun_arg)
        self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path)

        self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path)
-
    def test_fit_score_rerun(self):
        """Re-running fit-score from saved pipeline runs should reproduce the runs and the scores."""
        dataset_path = self._get_iris_dataset_path()
        problem_path = self._get_iris_problem_path()
        pipeline_path = self._get_random_forest_pipeline_path()
        pipeline_run_save_path = self._get_pipeline_run_save_path()
        scores_path = self._get_scores_path()

        hyperparams = [{}, {}, {'n_estimators': 19}, {}]
        random_seed = self._generate_seed()
        metrics = runtime.get_metrics_from_list(['ACCURACY', 'F1_MACRO'])
        scoring_params = {'add_normalized_scores': 'false'}
        scoring_random_seed = self._generate_seed()

        problem = problem_module.get_problem(problem_path)
        inputs = [dataset_module.get_dataset(dataset_path)]
        with open(pipeline_path) as f:
            pipeline = pipeline_module.Pipeline.from_yaml(f)
        with open(runtime.DEFAULT_SCORING_PIPELINE_PATH) as f:
            scoring_pipeline = pipeline_module.Pipeline.from_yaml(f)

        # Fit, produce, and score programmatically to obtain the reference runs and scores.
        with utils.silence():
            fitted_pipeline, predictions, fit_result = runtime.fit(
                pipeline, inputs, problem_description=problem, hyperparams=hyperparams,
                random_seed=random_seed, context=metadata_base.Context.TESTING,
            )
            self.assertFalse(fit_result.has_error(), fit_result.error)

            predictions, produce_result = runtime.produce(fitted_pipeline, inputs)
            self.assertFalse(produce_result.has_error(), produce_result.error)

            scores, score_result = runtime.score(
                predictions, inputs, scoring_pipeline=scoring_pipeline,
                problem_description=problem, metrics=metrics,
                predictions_random_seed=fitted_pipeline.random_seed,
                context=metadata_base.Context.TESTING, scoring_params=scoring_params, random_seed=scoring_random_seed
            )

            self.assertFalse(score_result.has_error(), score_result.error)
            scores.to_csv(scores_path)

            # Attach the scoring run and scores to the produce pipeline run before saving.
            runtime.combine_pipeline_runs(
                produce_result.pipeline_run, scoring_pipeline_run=score_result.pipeline_run, score_inputs=inputs,
                metrics=metrics, scores=scores
            )

        # Save both runs into one file; the produce run is appended after the fit run.
        with open(pipeline_run_save_path, 'w') as f:
            fit_result.pipeline_run.to_yaml(f)
            produce_result.pipeline_run.to_yaml(f, appending=True)

        self._assert_valid_saved_pipeline_runs(pipeline_run_save_path)

        self._cache_pipeline_for_rerun(pipeline_path)

        pipeline_rerun_save_path = self._get_pipeline_rerun_save_path()
        rescores_path = self._get_rescores_path()

        rerun_arg = [
            '',
            '--pipelines-path',
            self.test_dir,
            '--strict-digest',
            'runtime',
            '--datasets',
            TEST_DATA_DIR,
            'fit-score',
            '--input-run',
            pipeline_run_save_path,
            '--scores',
            rescores_path,
            '--output-run',
            pipeline_rerun_save_path,
        ]
        self._call_cli_runtime_without_fail(rerun_arg)
        self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path)
        self._assert_scores_equal(scores_path, rescores_path)
        self._assert_scores_equal_pipeline_run(scores_path, pipeline_rerun_save_path)
        self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path)
-
    def test_evaluate_rerun(self):
        """Re-running evaluate from saved pipeline runs should reproduce the runs and the fold scores."""
        dataset_path = self._get_iris_dataset_path()
        problem_path = self._get_iris_problem_path()
        pipeline_path = self._get_random_forest_pipeline_path()
        data_pipeline_path = self._get_train_test_split_data_pipeline_path()
        pipeline_run_save_path = self._get_pipeline_run_save_path()
        scores_path = self._get_scores_path()

        # Non-default hyper-parameters, data-preparation params, and seeds so the
        # rerun has to reproduce the full evaluate configuration.
        hyperparams = [{}, {}, {'n_estimators': 19}, {}]
        random_seed = self._generate_seed()
        metrics = runtime.get_metrics_from_list(['ACCURACY', 'F1_MACRO'])
        scoring_params = {'add_normalized_scores': 'false'}
        scoring_random_seed = self._generate_seed()
        data_params = {'shuffle': 'true', 'stratified': 'true', 'train_score_ratio': '0.59'}
        data_random_seed = self._generate_seed()

        problem = problem_module.get_problem(problem_path)
        inputs = [dataset_module.get_dataset(dataset_path)]
        with open(pipeline_path) as f:
            pipeline = pipeline_module.Pipeline.from_yaml(f)
        with open(data_pipeline_path) as f:
            data_pipeline = pipeline_module.Pipeline.from_yaml(f)
        with open(runtime.DEFAULT_SCORING_PIPELINE_PATH) as f:
            scoring_pipeline = pipeline_module.Pipeline.from_yaml(f)

        with utils.silence():
            dummy_runtime_environment = pipeline_run_module.RuntimeEnvironment(worker_id='dummy worker id')

            all_scores, all_results = runtime.evaluate(
                pipeline, inputs, data_pipeline=data_pipeline, scoring_pipeline=scoring_pipeline,
                problem_description=problem, data_params=data_params, metrics=metrics,
                context=metadata_base.Context.TESTING, scoring_params=scoring_params,
                hyperparams=hyperparams, random_seed=random_seed,
                data_random_seed=data_random_seed, scoring_random_seed=scoring_random_seed,
                runtime_environment=dummy_runtime_environment,
            )

        self.assertEqual(len(all_scores), 1)
        scores = runtime.combine_folds(all_scores)
        scores.to_csv(scores_path)

        # Fail with the first error from any fold, if there was one.
        if any(result.has_error() for result in all_results):
            self.fail([result.error for result in all_results if result.has_error()][0])

        # Save all fold pipeline runs into one file, appending after the first.
        with open(pipeline_run_save_path, 'w') as f:
            for i, pipeline_run in enumerate(all_results.pipeline_runs):
                pipeline_run.to_yaml(f, appending=i>0)

        # Both the scored pipeline and the data preparation pipeline must be resolvable by id.
        self._cache_pipeline_for_rerun(pipeline_path)
        self._cache_pipeline_for_rerun(data_pipeline_path)

        pipeline_rerun_save_path = self._get_pipeline_rerun_save_path()
        rescores_path = self._get_rescores_path()

        rerun_arg = [
            '',
            '--pipelines-path',
            self.test_dir,
            'runtime',
            '--datasets',
            TEST_DATA_DIR,
            'evaluate',
            '--input-run',
            pipeline_run_save_path,
            '--output-run',
            pipeline_rerun_save_path,
            '--scores',
            rescores_path,
        ]
        self._call_cli_runtime_without_fail(rerun_arg)
        self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path)
        self._assert_scores_equal(scores_path, rescores_path)
        self._assert_scores_equal_pipeline_run(scores_path, pipeline_rerun_save_path)
        self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path)
-
- # See: https://gitlab.com/datadrivendiscovery/d3m/issues/406
- # TODO: Test rerun validation code (that we throw exceptions on invalid pipeline runs).
- # TODO: Test rerun with multiple inputs (non-standard pipeline).
- # TODO: Test rerun without problem description.
- # TODO: Test evaluate rerun with data split file.
-
- def test_validate_gzipped_pipeline_run(self):
- # First, generate the pipeline run file
- pipeline_run_save_path = self._get_pipeline_run_save_path()
- gzip_pipeline_run_save_path = '{pipeline_run_save_path}.gz'.format(pipeline_run_save_path=pipeline_run_save_path)
- fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline')
- self._fit_iris_random_forest(
- fitted_pipeline_path=fitted_pipeline_path, pipeline_run_save_path=pipeline_run_save_path
- )
-
- # Second, gzip the pipeline run file
- with open(pipeline_run_save_path, 'rb') as file_in:
- with gzip.open(gzip_pipeline_run_save_path, 'wb') as file_out:
- shutil.copyfileobj(file_in, file_out)
- os.remove(pipeline_run_save_path)
-
- # Third, ensure that calling 'pipeline-run validate' on the gzipped pipeline run file is successful
- arg = [
- '',
- 'pipeline-run',
- 'validate',
- gzip_pipeline_run_save_path,
- ]
- self._call_cli_runtime_without_fail(arg)
-
- def test_help_message(self):
- arg = [
- '',
- 'runtime',
- 'fit',
- '--version',
- ]
-
- with io.StringIO() as buffer:
- with contextlib.redirect_stderr(buffer):
- with self.assertRaises(SystemExit):
- cli.main(arg)
-
- help = buffer.getvalue()
- self.assertTrue('usage: d3m runtime fit' in help, help)
-
-
# Allow running this test module directly, outside a test runner.
if __name__ == '__main__':
    unittest.main()
|