- from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
- from numpy import ndarray
- from collections import OrderedDict
- from scipy import sparse
- import os
- import sklearn
- import numpy
- import typing
-
- # Custom import commands if any
- from sklearn.gaussian_process import GaussianProcessRegressor
-
-
- from d3m.container.numpy import ndarray as d3m_ndarray
- from d3m.container import DataFrame as d3m_dataframe
- from d3m.metadata import hyperparams, params, base as metadata_base
- from d3m import utils
- from d3m.base import utils as base_utils
- from d3m.exceptions import PrimitiveNotFittedError
- from d3m.primitive_interfaces.base import CallResult, DockerContainer
-
- from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
- from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
- from d3m import exceptions
- import pandas
-
-
-
- Inputs = d3m_dataframe
- Outputs = d3m_dataframe
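- # Both the inputs and the outputs of this primitive are d3m DataFrame containers; which of
- # their columns are actually used is governed by the column-selection hyper-parameters below.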
-
-
- class Params(params.Params):
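- # Fitted state of the wrapped GaussianProcessRegressor (X_train_, y_train_, kernel_, alpha_,
- # L_, ...) plus the wrapper's own bookkeeping (selected columns, target metadata), so that a
- # fitted primitive can be round-tripped through get_params/set_params.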
- X_train_: Optional[ndarray]
- y_train_: Optional[ndarray]
- kernel_: Optional[Callable]
- alpha_: Optional[ndarray]
- log_marginal_likelihood_value_: Optional[float]
- _y_train_mean: Optional[ndarray]
- _rng: Optional[numpy.random.RandomState]
- L_: Optional[ndarray]
- _K_inv: Optional[object]
- input_column_names: Optional[Any]
- target_names_: Optional[Sequence[Any]]
- training_indices_: Optional[Sequence[int]]
- target_column_indices_: Optional[Sequence[int]]
- target_columns_metadata_: Optional[List[OrderedDict]]
-
-
-
- class Hyperparams(hyperparams.Hyperparams):
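- # The tuning parameters mirror constructor arguments of sklearn's GaussianProcessRegressor;
- # the control parameters govern d3m-specific column selection and output handling.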
- alpha = hyperparams.Union(
- configuration=OrderedDict({
- 'float': hyperparams.Hyperparameter[float](
- default=1e-10,
- semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
- ),
- 'ndarray': hyperparams.Hyperparameter[ndarray](
- default=numpy.array([]),
- semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
- )
- }),
- default='float',
- description='Value added to the diagonal of the kernel matrix during fitting. Larger values correspond to an increased noise level in the observations and reduce potential numerical issues during fitting. If an array is passed, it must have the same number of entries as the data used for fitting and is used as a datapoint-dependent noise level. Note that this is equivalent to adding a WhiteKernel with c=alpha. Allowing to specify the noise level directly as a parameter is mainly for convenience and for consistency with Ridge.',
- semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
- )
- optimizer = hyperparams.Constant(
- default='fmin_l_bfgs_b',
- description='Can either be one of the internally supported optimizers for optimizing the kernel\'s parameters, specified by a string, or an externally defined optimizer passed as a callable. If a callable is passed, it must have the signature optimizer(obj_func, initial_theta, bounds), where obj_func is the objective function to be maximized (it takes the hyperparameters theta as a parameter and an optional flag eval_gradient that determines whether the gradient is returned in addition to the function value), initial_theta is the initial value for theta (usable by local optimizers), and bounds are the bounds on the values of theta. It returns the best found hyperparameters theta and the corresponding value of the target function. By default, the \'fmin_l_bfgs_b\' algorithm from scipy.optimize is used. If None is passed, the kernel\'s parameters are kept fixed. Available internal optimizers are: \'fmin_l_bfgs_b\'.',
- semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
- )
- n_restarts_optimizer = hyperparams.Bounded[int](
- default=0,
- lower=0,
- upper=None,
- description='The number of restarts of the optimizer for finding the kernel\'s parameters which maximize the log-marginal likelihood. The first run of the optimizer is performed from the kernel\'s initial parameters, the remaining ones (if any) from thetas sampled log-uniform randomly from the space of allowed theta-values. If greater than 0, all bounds must be finite. Note that n_restarts_optimizer == 0 implies that one run is performed.',
- semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
- )
- normalize_y = hyperparams.UniformBool(
- default=False,
- description='Whether the target values y are normalized, i.e., the mean of the observed target values becomes zero. This parameter should be set to True if the target values\' mean is expected to differ considerably from zero. When enabled, the normalization effectively modifies the GP\'s prior based on the data, which contradicts the likelihood principle; normalization is thus disabled per default.',
- semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
- )
-
- use_inputs_columns = hyperparams.Set(
- elements=hyperparams.Hyperparameter[int](-1),
- default=(),
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="A set of column indices to force the primitive to use as training input. If any specified column cannot be used, it is skipped.",
- )
- use_outputs_columns = hyperparams.Set(
- elements=hyperparams.Hyperparameter[int](-1),
- default=(),
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="A set of column indices to force the primitive to use as training target. If any specified column cannot be used, it is skipped.",
- )
- exclude_inputs_columns = hyperparams.Set(
- elements=hyperparams.Hyperparameter[int](-1),
- default=(),
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="A set of column indices not to use as training inputs. Applicable only if \"use_inputs_columns\" is not provided.",
- )
- exclude_outputs_columns = hyperparams.Set(
- elements=hyperparams.Hyperparameter[int](-1),
- default=(),
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="A set of column indices not to use as training target. Applicable only if \"use_outputs_columns\" is not provided.",
- )
- return_result = hyperparams.Enumeration(
- values=['append', 'replace', 'new'],
- default='new',
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="Should predicted columns be appended, should they replace original columns, or should only predicted columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
- )
- use_semantic_types = hyperparams.UniformBool(
- default=False,
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="Controls whether semantic_types metadata will be used for filtering columns in the input dataframe. Setting this to false makes the code ignore return_result and produce only the output dataframe."
- )
- add_index_columns = hyperparams.UniformBool(
- default=False,
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
- )
- error_on_no_input = hyperparams.UniformBool(
- default=True,
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
- description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking, set this to False.",
- )
-
- return_semantic_type = hyperparams.Enumeration[str](
- values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
- default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
- description='Decides what semantic type to attach to generated output',
- semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
- )
-
- class SKGaussianProcessRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
- """
- Primitive wrapping the scikit-learn GaussianProcessRegressor.
-
- `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html>`_
-
- """
-
- __author__ = "JPL MARVIN"
- metadata = metadata_base.PrimitiveMetadata({
- "algorithm_types": [metadata_base.PrimitiveAlgorithmType.GAUSSIAN_PROCESS, ],
- "name": "sklearn.gaussian_process.gpr.GaussianProcessRegressor",
- "primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
- "python_path": "d3m.primitives.regression.gaussian_process.SKlearn",
- "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html']},
- "version": "2019.11.13",
- "id": "3894e630-d67b-35d9-ab78-233e264f6324",
- "hyperparams_to_tune": ['alpha'],
- 'installation': [
- {'type': metadata_base.PrimitiveInstallationType.PIP,
- 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
- git_commit=utils.current_git_commit(os.path.dirname(__file__)),
- ),
- }]
- })
-
- def __init__(self, *,
- hyperparams: Hyperparams,
- random_seed: int = 0,
- docker_containers: Dict[str, DockerContainer] = None) -> None:
-
- super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
-
- # Construct the wrapped scikit-learn estimator from the primitive's hyper-parameters.
- self._clf = GaussianProcessRegressor(
- alpha=self.hyperparams['alpha'],
- optimizer=self.hyperparams['optimizer'],
- n_restarts_optimizer=self.hyperparams['n_restarts_optimizer'],
- normalize_y=self.hyperparams['normalize_y'],
- random_state=self.random_seed,
- )
-
- self._inputs = None
- self._outputs = None
- self._training_inputs = None
- self._training_outputs = None
- self._target_names = None
- self._training_indices = None
- self._target_column_indices = None
- self._target_columns_metadata: Optional[List[OrderedDict]] = None
- self._input_column_names = None
- self._fitted = False
- self._new_training_data = False
-
- def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
- self._inputs = inputs
- self._outputs = outputs
- self._fitted = False
- self._new_training_data = True
-
- def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
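- # Select the training columns, fit the wrapped estimator, and remember the target metadata
- # needed later to annotate predictions.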
- if self._inputs is None or self._outputs is None:
- raise ValueError("Missing training data.")
-
- if not self._new_training_data:
- return CallResult(None)
- self._new_training_data = False
-
- self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
- self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
- self._input_column_names = self._training_inputs.columns
-
- if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
- self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
- sk_training_output = self._training_outputs.values
-
- shape = sk_training_output.shape
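- # sklearn expects a 1-d target array for single-target regression.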
- if len(shape) == 2 and shape[1] == 1:
- sk_training_output = numpy.ravel(sk_training_output)
-
- self._clf.fit(self._training_inputs, sk_training_output)
- self._fitted = True
- else:
- if self.hyperparams['error_on_no_input']:
- raise RuntimeError("No input columns were selected")
- self.logger.warning("No input columns were selected")
-
- return CallResult(None)
-
-
-
- def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
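- # Predict with the wrapped estimator on the selected input columns and combine the
- # predictions with the input frame according to the 'return_result' hyper-parameter.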
- sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
- output = []
- if len(sk_inputs.columns):
- try:
- sk_output = self._clf.predict(sk_inputs)
- except sklearn.exceptions.NotFittedError as error:
- raise PrimitiveNotFittedError("Primitive not fitted.") from error
- # GaussianProcessRegressor can predict from its prior without being fitted, so sklearn does not raise NotFittedError; enforce the fitted check explicitly.
- if not self._fitted:
- raise PrimitiveNotFittedError("Primitive not fitted.")
- if sparse.issparse(sk_output):
- sk_output = sk_output.toarray()
- output = self._wrap_predictions(inputs, sk_output)
- output.columns = self._target_names
- output = [output]
- else:
- if self.hyperparams['error_on_no_input']:
- raise RuntimeError("No input columns were selected")
- self.logger.warning("No input columns were selected")
- outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
- add_index_columns=self.hyperparams['add_index_columns'],
- inputs=inputs, column_indices=self._target_column_indices,
- columns_list=output)
-
- return CallResult(outputs)
-
-
- def get_params(self) -> Params:
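- # Export the fitted sklearn attributes (or None placeholders when unfitted) together with
- # the wrapper's bookkeeping so the primitive's state can be serialized and restored via set_params.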
- if not self._fitted:
- return Params(
- X_train_=None,
- y_train_=None,
- kernel_=None,
- alpha_=None,
- log_marginal_likelihood_value_=None,
- _y_train_mean=None,
- _rng=None,
- L_=None,
- _K_inv=None,
- input_column_names=self._input_column_names,
- training_indices_=self._training_indices,
- target_names_=self._target_names,
- target_column_indices_=self._target_column_indices,
- target_columns_metadata_=self._target_columns_metadata
- )
-
- return Params(
- X_train_=getattr(self._clf, 'X_train_', None),
- y_train_=getattr(self._clf, 'y_train_', None),
- kernel_=getattr(self._clf, 'kernel_', None),
- alpha_=getattr(self._clf, 'alpha_', None),
- log_marginal_likelihood_value_=getattr(self._clf, 'log_marginal_likelihood_value_', None),
- _y_train_mean=getattr(self._clf, '_y_train_mean', None),
- _rng=getattr(self._clf, '_rng', None),
- L_=getattr(self._clf, 'L_', None),
- _K_inv=getattr(self._clf, '_K_inv', None),
- input_column_names=self._input_column_names,
- training_indices_=self._training_indices,
- target_names_=self._target_names,
- target_column_indices_=self._target_column_indices,
- target_columns_metadata_=self._target_columns_metadata
- )
-
- def set_params(self, *, params: Params) -> None:
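- # Restore the wrapped estimator's fitted attributes and the wrapper's bookkeeping.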
- self._clf.X_train_ = params['X_train_']
- self._clf.y_train_ = params['y_train_']
- self._clf.kernel_ = params['kernel_']
- self._clf.alpha_ = params['alpha_']
- self._clf.log_marginal_likelihood_value_ = params['log_marginal_likelihood_value_']
- self._clf._y_train_mean = params['_y_train_mean']
- self._clf._rng = params['_rng']
- self._clf.L_ = params['L_']
- self._clf._K_inv = params['_K_inv']
- self._input_column_names = params['input_column_names']
- self._training_indices = params['training_indices_']
- self._target_names = params['target_names_']
- self._target_column_indices = params['target_column_indices_']
- self._target_columns_metadata = params['target_columns_metadata_']
-
- # The primitive counts as fitted if any of the estimator's fitted attributes were restored.
- if any(params[name] is not None for name in (
-         'X_train_', 'y_train_', 'kernel_', 'alpha_',
-         'log_marginal_likelihood_value_', '_y_train_mean',
-         '_rng', 'L_', '_K_inv')):
-     self._fitted = True
-
- @classmethod
- def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
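- # When semantic types are not used, every input column is treated as a feature; otherwise
- # only numeric columns carrying the Attribute semantic type pass the filter below.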
- if not hyperparams['use_semantic_types']:
- return inputs, list(range(len(inputs.columns)))
-
- inputs_metadata = inputs.metadata
-
- def can_produce_column(column_index: int) -> bool:
- return cls._can_produce_column(inputs_metadata, column_index, hyperparams)
-
- columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
- use_columns=hyperparams['use_inputs_columns'],
- exclude_columns=hyperparams['exclude_inputs_columns'],
- can_use_column=can_produce_column)
- return inputs.iloc[:, columns_to_produce], columns_to_produce
-
- @classmethod
- def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
- column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
-
- accepted_structural_types = (int, float, numpy.integer, numpy.float64)
- accepted_semantic_types = set()
- accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
- if not issubclass(column_metadata['structural_type'], accepted_structural_types):
- return False
-
- semantic_types = set(column_metadata.get('semantic_types', []))
-
- if len(semantic_types) == 0:
- cls.logger.warning("No semantic types found in column metadata")
- return False
- # Making sure all accepted_semantic_types are available in semantic_types
- if len(accepted_semantic_types - semantic_types) == 0:
- return True
-
- return False
-
- @classmethod
- def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
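- # Without semantic types the whole outputs frame is used as the target; otherwise only
- # columns marked as TrueTarget are selected.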
- if not hyperparams['use_semantic_types']:
- return data, list(data.columns), list(range(len(data.columns)))
-
- metadata = data.metadata
-
- def can_produce_column(column_index: int) -> bool:
- accepted_semantic_types = set()
- accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
- column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
- semantic_types = set(column_metadata.get('semantic_types', []))
- if len(semantic_types) == 0:
- cls.logger.warning("No semantic types found in column metadata")
- return False
- # Making sure all accepted_semantic_types are available in semantic_types
- if len(accepted_semantic_types - semantic_types) == 0:
- return True
- return False
-
- target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
-     metadata,
-     use_columns=hyperparams['use_outputs_columns'],
-     exclude_columns=hyperparams['exclude_outputs_columns'],
-     can_use_column=can_produce_column,
- )
- targets = []
- if target_column_indices:
- targets = data.select_columns(target_column_indices)
- target_column_names = []
- for idx in target_column_indices:
- target_column_names.append(data.columns[idx])
- return targets, target_column_names, target_column_indices
-
- @classmethod
- def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
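- # Rewrite the targets' semantic types for prediction columns: TrueTarget/SuggestedTarget are
- # dropped, PredictedTarget and the 'return_semantic_type' hyper-parameter are added.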
- outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
-
- target_columns_metadata: List[OrderedDict] = []
- for column_index in range(outputs_length):
- column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
-
- # Update semantic types and prepare it for predicted targets.
- semantic_types = set(column_metadata.get('semantic_types', []))
- semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
- add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
- add_semantic_types.add(hyperparams["return_semantic_type"])
- semantic_types = semantic_types - semantic_types_to_remove
- semantic_types = semantic_types.union(add_semantic_types)
- column_metadata['semantic_types'] = list(semantic_types)
-
- target_columns_metadata.append(column_metadata)
-
- return target_columns_metadata
-
- @classmethod
- def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
- target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
- outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
-
- for column_index, column_metadata in enumerate(target_columns_metadata):
- column_metadata.pop("structural_type", None)
- outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
-
- return outputs_metadata
-
- def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
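- # Wrap the raw prediction ndarray in a d3m DataFrame and attach the stored target metadata.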
- outputs = d3m_dataframe(predictions, generate_metadata=False)
- outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
- return outputs
-
-
- @classmethod
- def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
- outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
-
- target_columns_metadata: List[OrderedDict] = []
- for column_index in range(outputs_length):
- column_metadata = OrderedDict()
- semantic_types = []
- semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
- column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
- if column_name is None:
- column_name = "output_{}".format(column_index)
- column_metadata["semantic_types"] = semantic_types
- column_metadata["name"] = str(column_name)
- target_columns_metadata.append(column_metadata)
-
- return target_columns_metadata
-
-
- SKGaussianProcessRegressor.__doc__ = GaussianProcessRegressor.__doc__
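-
-
- # ---------------------------------------------------------------------------
- # Usage sketch (illustrative only, not part of the generated wrapper). It shows
- # how the primitive might be exercised directly, outside of a D3M pipeline. The
- # toy data is hypothetical, and column selection is left at its defaults
- # (use_semantic_types=False), so all columns of both frames are used as-is.
- # ---------------------------------------------------------------------------
- if __name__ == '__main__':
-     primitive = SKGaussianProcessRegressor(hyperparams=Hyperparams.defaults())
-
-     train_inputs = d3m_dataframe({'x0': [0.0, 1.0, 2.0, 3.0]}, generate_metadata=True)
-     train_outputs = d3m_dataframe({'y': [0.0, 0.8, 0.9, 0.1]}, generate_metadata=True)
-
-     primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
-     primitive.fit()
-
-     predictions = primitive.produce(inputs=train_inputs).value
-     print(predictions)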