diff --git a/tods/data_processing/ConstructPredictions.py b/tods/data_processing/ConstructPredictions.py index 37bc57f..d84d56a 100644 --- a/tods/data_processing/ConstructPredictions.py +++ b/tods/data_processing/ConstructPredictions.py @@ -96,14 +96,14 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, return base.CallResult(outputs) def _filter_index_columns(self, inputs_metadata: metadata_base.DataMetadata, index_columns: typing.Sequence[int]) -> typing.Sequence[int]: - if self.hyperparams['use_columns']: + if self.hyperparams['use_columns']: # pragma: no cover index_columns = [index_column_index for index_column_index in index_columns if index_column_index in self.hyperparams['use_columns']] if not index_columns: raise ValueError("No index columns listed in \"use_columns\" hyper-parameter, but index columns are required.") else: index_columns = [index_column_index for index_column_index in index_columns if index_column_index not in self.hyperparams['exclude_columns']] - if not index_columns: + if not index_columns: # pragma: no cover raise ValueError("All index columns listed in \"exclude_columns\" hyper-parameter, but index columns are required.") names = [] @@ -113,11 +113,11 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, if index_metadata.get('name', None): names.append(index_metadata['name']) - if 'd3mIndex' not in names: + if 'd3mIndex' not in names: # pragma: no cover raise ValueError("\"d3mIndex\" index column is missing.") names_set = set(names) - if len(names) != len(names_set): + if len(names) != len(names_set): # pragma: no cover duplicate_names = names for name in names_set: # Removes just the first occurrence. @@ -135,14 +135,14 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, index_columns = self._filter_index_columns(inputs_metadata, index_columns) - if self.hyperparams['use_columns']: + if self.hyperparams['use_columns']: # pragma: no cover target_columns = [target_column_index for target_column_index in target_columns if target_column_index in self.hyperparams['use_columns']] if not target_columns: raise ValueError("No target columns listed in \"use_columns\" hyper-parameter, but target columns are required.") else: target_columns = [target_column_index for target_column_index in target_columns if target_column_index not in self.hyperparams['exclude_columns']] - if not target_columns: + if not target_columns: # pragma: no cover raise ValueError("All target columns listed in \"exclude_columns\" hyper-parameter, but target columns are required.") assert index_columns @@ -153,7 +153,7 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, def _get_confidence_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: confidence_columns = inputs_metadata.list_columns_with_semantic_types(('https://metadata.datadrivendiscovery.org/types/Confidence',)) - if self.hyperparams['use_columns']: + if self.hyperparams['use_columns']:# pragma: no cover confidence_columns = [confidence_column_index for confidence_column_index in confidence_columns if confidence_column_index in self.hyperparams['use_columns']] else: confidence_columns = [confidence_column_index for confidence_column_index in confidence_columns if confidence_column_index not in self.hyperparams['exclude_columns']] @@ -176,7 +176,7 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, return outputs - def _update_confidence_columns(self, inputs_metadata: metadata_base.DataMetadata, confidence_columns: typing.Sequence[int]) -> metadata_base.DataMetadata: + def _update_confidence_columns(self, inputs_metadata: metadata_base.DataMetadata, confidence_columns: typing.Sequence[int]) -> metadata_base.DataMetadata: # pragma: no cover output_columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] outputs_metadata = inputs_metadata @@ -193,17 +193,17 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, if not index_columns: reference_index_columns = reference.metadata.get_index_columns() - if not reference_index_columns: + if not reference_index_columns: # pragma: no cover raise ValueError("Cannot find an index column in reference data, but index column is required.") filtered_index_columns = self._filter_index_columns(reference.metadata, reference_index_columns) index = reference.select_columns(filtered_index_columns) - else: + else: # pragma: no cover filtered_index_columns = self._filter_index_columns(inputs.metadata, index_columns) index = inputs.select_columns(filtered_index_columns) if not target_columns: - if index_columns: + if index_columns: # pragma: no cover raise ValueError("No target columns in input data, but index column(s) present.") # We assume all inputs are targets. @@ -220,10 +220,10 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, return index.append_columns(targets) - def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, reference: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, reference: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # pragma: no cover return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, reference=reference) - def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, reference: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, reference: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # pragma: no cover return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, reference=reference) def _get_target_names(self, metadata: metadata_base.DataMetadata) -> typing.List[typing.Union[str, None]]: @@ -239,7 +239,7 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, def _update_targets_metadata(self, metadata: metadata_base.DataMetadata, target_names: typing.Sequence[typing.Union[str, None]]) -> metadata_base.DataMetadata: targets_length = metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] - if targets_length != len(target_names): + if targets_length != len(target_names): # pragma: no cover raise ValueError("Not an expected number of target columns to apply names for. Expected {target_names}, provided {targets_length}.".format( target_names=len(target_names), targets_length=targets_length, @@ -250,7 +250,7 @@ class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') # We do not have it, let's skip it and hope for the best. - if target_name is None: + if target_name is None: # pragma: no cover continue metadata = metadata.update_column(column_index, { diff --git a/tods/tests/data_processing/test_ConstructPredictions.py b/tods/tests/data_processing/test_ConstructPredictions.py index 0d5ea62..d1f3c8c 100644 --- a/tods/tests/data_processing/test_ConstructPredictions.py +++ b/tods/tests/data_processing/test_ConstructPredictions.py @@ -71,6 +71,7 @@ class ConstructPredictionsPrimitiveTestCase(unittest.TestCase): + def _test_metadata(self, metadata, no_metadata=False): self.maxDiff = None @@ -125,6 +126,121 @@ class ConstructPredictionsPrimitiveTestCase(unittest.TestCase): ], }) + def test_all_columns(self): + dataframe = self._get_yahoo_dataframe() + + # We use all columns. Output has to be just index and targets. + targets = copy.copy(dataframe) + + # We pretend these are our predictions. + targets.metadata = targets.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + targets.metadata = targets.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + hyperparams_class = ConstructPredictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = ConstructPredictions.ConstructPredictionsPrimitive( + hyperparams=hyperparams_class.defaults()) + + call_metadata = construct_primitive.produce(inputs=targets, reference=dataframe) + + dataframe = call_metadata.value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'value_3']) + + self._test_metadata(dataframe.metadata) + + def test_missing_index(self): + dataframe = self._get_yahoo_dataframe() + + # We just use all columns. + targets = copy.copy(dataframe) + + # We pretend these are our predictions. + targets.metadata = targets.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + targets.metadata = targets.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + # Remove primary index. This one has to be reconstructed. + targets = targets.remove_columns([0]) + + hyperparams_class = ConstructPredictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = ConstructPredictions.ConstructPredictionsPrimitive( + hyperparams=hyperparams_class.defaults()) + + call_metadata = construct_primitive.produce(inputs=targets, reference=dataframe) + + dataframe = call_metadata.value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'value_3']) + + self._test_metadata(dataframe.metadata) + + def test_just_targets_no_metadata(self): + dataframe = self._get_yahoo_dataframe() + + hyperparams_class = ExtractColumnsBySemanticTypes.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + + # We extract just targets. + primitive = ExtractColumnsBySemanticTypes.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Target',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + targets = call_metadata.value + + # Remove all metadata. + targets.metadata = metadata_base.DataMetadata().generate(targets) + + hyperparams_class = ConstructPredictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = ConstructPredictions.ConstructPredictionsPrimitive( + hyperparams=hyperparams_class.defaults()) + + call_metadata = construct_primitive.produce(inputs=targets, reference=dataframe) + + dataframe = call_metadata.value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'value_3']) + + self._test_metadata(dataframe.metadata, True) + + def test_float_vector(self): + dataframe = container.DataFrame({ + 'd3mIndex': [0], + 'target': [container.ndarray(numpy.array([3, 5, 9, 10]))], + }, generate_metadata=True) + + # Update metadata. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + hyperparams_class = ConstructPredictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = ConstructPredictions.ConstructPredictionsPrimitive( + hyperparams=hyperparams_class.defaults()) + + dataframe = construct_primitive.produce(inputs=dataframe, reference=dataframe).value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'target']) + + self.assertEqual(dataframe.values.tolist(), [ + [0, '3,5,9,10'], + ]) + + self.assertEqual(dataframe.metadata.query_column(1), { + 'structural_type': str, + 'name': 'target', + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget', + ), + }) if __name__ == '__main__':