|
- from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
- from numpy import ndarray
- from collections import OrderedDict
- from scipy import sparse
- import os
- import sklearn
- import numpy
- import typing
-
- # Custom import commands if any
- from sklearn.tree.tree import DecisionTreeClassifier
- import numpy
-
-
- from d3m.container.numpy import ndarray as d3m_ndarray
- from d3m.container import DataFrame as d3m_dataframe
- from d3m.metadata import hyperparams, params, base as metadata_base
- from d3m import utils
- from d3m.base import utils as base_utils
- from d3m.exceptions import PrimitiveNotFittedError
- from d3m.primitive_interfaces.base import CallResult, DockerContainer
-
- from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
- from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
- from d3m import exceptions
- import pandas
-
-
-
# Container types exchanged with this primitive: both the data it consumes
# and the predictions it emits are d3m DataFrames.
Inputs = d3m_dataframe
Outputs = d3m_dataframe
-
-
class Params(params.Params):
    """
    Serializable state of the decision tree primitive.

    Every field is optional: the primitive in this file reports ``None`` for
    the estimator attributes while it is not fitted.
    """

    # Attributes read from / written to the wrapped sklearn estimator.
    classes_: Optional[Union[ndarray, List[ndarray]]]
    max_features_: Optional[int]
    n_classes_: Optional[Union[numpy.int64, List[numpy.int64]]]
    n_features_: Optional[int]
    n_outputs_: Optional[int]
    tree_: Optional[object]

    # Column bookkeeping kept by the primitive itself.
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]
-
-
-
class Hyperparams(hyperparams.Hyperparams):
    """
    Hyper-parameters of the decision tree primitive.

    The first group is forwarded to the ``DecisionTreeClassifier`` constructor
    by the primitive in this file; the second group controls column selection
    and how the produced columns are returned.
    """

    # ------------------------------------------------------------------
    # Estimator hyper-parameters
    # ------------------------------------------------------------------
    criterion = hyperparams.Enumeration[str](
        values=['gini', 'entropy'],
        default='gini',
        description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    splitter = hyperparams.Enumeration[str](
        values=['best', 'random'],
        default='best',
        description='The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    max_depth = hyperparams.Union(
        configuration=OrderedDict([
            ('int', hyperparams.Bounded[int](
                default=10,
                lower=0,
                upper=None,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('none', hyperparams.Constant(
                default=None,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
        ]),
        default='none',
        description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    min_samples_split = hyperparams.Union(
        configuration=OrderedDict([
            ('absolute', hyperparams.Bounded[int](
                default=2,
                lower=1,
                upper=None,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('percent', hyperparams.Bounded[float](
                default=0.25,
                lower=0,
                upper=1,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
        ]),
        default='absolute',
        description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    min_samples_leaf = hyperparams.Union(
        configuration=OrderedDict([
            ('absolute', hyperparams.Bounded[int](
                default=1,
                lower=1,
                upper=None,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('percent', hyperparams.Bounded[float](
                default=0.25,
                lower=0,
                upper=0.5,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
        ]),
        default='absolute',
        description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    min_weight_fraction_leaf = hyperparams.Bounded[float](
        default=0,
        lower=0,
        upper=0.5,
        description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    # NOTE(review): the 'int' branch allows and defaults to 0 -- confirm that
    # the wrapped estimator accepts that value before selecting this branch.
    max_leaf_nodes = hyperparams.Union(
        configuration=OrderedDict([
            ('int', hyperparams.Bounded[int](
                lower=0,
                upper=None,
                default=0,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('none', hyperparams.Constant(
                default=None,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
        ]),
        default='none',
        description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    # NOTE(review): 'specified_int' likewise defaults to 0 -- confirm that a
    # feature count of zero is meaningful for the wrapped estimator.
    max_features = hyperparams.Union(
        configuration=OrderedDict([
            ('specified_int', hyperparams.Bounded[int](
                lower=0,
                upper=None,
                default=0,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('calculated', hyperparams.Enumeration[str](
                values=['auto', 'sqrt', 'log2'],
                default='auto',
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('none', hyperparams.Constant(
                default=None,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('percent', hyperparams.Bounded[float](
                default=0.25,
                lower=0,
                upper=1,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
        ]),
        default='none',
        description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    min_impurity_decrease = hyperparams.Bounded[float](
        default=0.0,
        lower=0.0,
        upper=None,
        description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    class_weight = hyperparams.Union(
        configuration=OrderedDict([
            ('str', hyperparams.Constant(
                default='balanced',
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
            ('none', hyperparams.Constant(
                default=None,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )),
        ]),
        default='none',
        description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    presort = hyperparams.UniformBool(
        default=False,
        description='Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large datasets, setting this to true may slow down the training process. When using either a smaller dataset or a restricted depth, this may speed up the training.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
    )

    # ------------------------------------------------------------------
    # Column selection and output shaping
    # ------------------------------------------------------------------
    use_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
    )
    use_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
    )
    exclude_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )

    return_semantic_type = hyperparams.Enumeration[str](
        values=[
            'https://metadata.datadrivendiscovery.org/types/Attribute',
            'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
        ],
        default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
        description='Decides what semantic type to attach to generated output',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
    )
-
class SKDecisionTreeClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
                               ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn DecisionTreeClassifier
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>`_

    """

    __author__ = "JPL MARVIN"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ],
        "name": "sklearn.tree.tree.DecisionTreeClassifier",
        "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
        "python_path": "d3m.primitives.classification.decision_tree.SKlearn",
        "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html']},
        "version": "2019.11.13",
        "id": "e20d003d-6a9f-35b0-b4b5-20e42b30282a",
        "hyperparams_to_tune": ['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'],
        'installation': [
            {'type': metadata_base.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
                 git_commit=utils.current_git_commit(os.path.dirname(__file__)),
             ),
             }]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        """
        Build the wrapped estimator from ``hyperparams`` and reset all state.

        The primitive's random seed is forwarded as the estimator's
        ``random_state``.
        """
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = DecisionTreeClassifier(
            criterion=self.hyperparams['criterion'],
            splitter=self.hyperparams['splitter'],
            max_depth=self.hyperparams['max_depth'],
            min_samples_split=self.hyperparams['min_samples_split'],
            min_samples_leaf=self.hyperparams['min_samples_leaf'],
            min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'],
            max_leaf_nodes=self.hyperparams['max_leaf_nodes'],
            max_features=self.hyperparams['max_features'],
            min_impurity_decrease=self.hyperparams['min_impurity_decrease'],
            class_weight=self.hyperparams['class_weight'],
            presort=self.hyperparams['presort'],
            random_state=self.random_seed,
        )

        # Raw training data handed over through set_training_data().
        self._inputs = None
        self._outputs = None
        # Column-filtered views of the training data and their bookkeeping,
        # populated by fit().
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
        self._new_training_data = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Store new training data and mark the primitive as needing a refit."""
        self._inputs = inputs
        self._outputs = outputs
        self._fitted = False
        self._new_training_data = True

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit the wrapped estimator on the stored training data.

        Does nothing when no new training data arrived since the last call.
        ``timeout`` and ``iterations`` are accepted but not used.

        Raises
        ------
        ValueError
            If no training data has been set.
        RuntimeError
            If no input or target column is selected and the
            ``error_on_no_input`` hyper-parameter is true.
        """
        if self._inputs is None or self._outputs is None:
            raise ValueError("Missing training data.")

        if not self._new_training_data:
            return CallResult(None)
        self._new_training_data = False

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
            self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
            sk_training_output = self._training_outputs.values

            # A single target column is flattened to 1-D before fitting.
            shape = sk_training_output.shape
            if len(shape) == 2 and shape[1] == 1:
                sk_training_output = numpy.ravel(sk_training_output)

            self._clf.fit(self._training_inputs, sk_training_output)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Logger.warn() is a deprecated alias; use warning().
            self.logger.warning("No input columns were selected")

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Predict targets for ``inputs`` and combine them with the input frame.

        How predictions are combined with ``inputs`` is governed by the
        ``return_result`` and ``add_index_columns`` hyper-parameters.
        ``timeout`` and ``iterations`` are accepted but not used.

        Raises
        ------
        PrimitiveNotFittedError
            If the primitive has not been fitted.
        RuntimeError
            If no input column is selected and ``error_on_no_input`` is true.
        """
        sk_inputs, _ = self._get_columns_to_fit(inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            try:
                sk_output = self._clf.predict(sk_inputs)
            except sklearn.exceptions.NotFittedError as error:
                raise PrimitiveNotFittedError("Primitive not fitted.") from error
            # For primitives that allow predicting without fitting like GaussianProcessRegressor
            if not self._fitted:
                raise PrimitiveNotFittedError("Primitive not fitted.")
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            output = self._wrap_predictions(inputs, sk_output)
            output.columns = self._target_names
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Logger.warn() is a deprecated alias; use warning().
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._target_column_indices,
                                             columns_list=output)

        return CallResult(outputs)

    def get_params(self) -> Params:
        """
        Return the current state as a ``Params`` object.

        Estimator attributes are reported as ``None`` while the primitive is
        not fitted; the column bookkeeping is always reported as stored.
        """
        is_fitted = self._fitted

        def learned(attribute_name: str) -> Any:
            # Only expose estimator attributes once a fit has succeeded.
            if not is_fitted:
                return None
            return getattr(self._clf, attribute_name, None)

        return Params(
            classes_=learned('classes_'),
            max_features_=learned('max_features_'),
            n_classes_=learned('n_classes_'),
            n_features_=learned('n_features_'),
            n_outputs_=learned('n_outputs_'),
            tree_=learned('tree_'),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        """
        Restore state from ``params``.

        The primitive is marked as fitted as soon as any estimator attribute
        in ``params`` is not ``None``; the fitted flag is never cleared here.
        """
        self._clf.classes_ = params['classes_']
        self._clf.max_features_ = params['max_features_']
        self._clf.n_classes_ = params['n_classes_']
        self._clf.n_features_ = params['n_features_']
        self._clf.n_outputs_ = params['n_outputs_']
        self._clf.tree_ = params['tree_']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']

        estimator_attributes = ('classes_', 'max_features_', 'n_classes_', 'n_features_', 'n_outputs_', 'tree_')
        if any(params[name] is not None for name in estimator_attributes):
            self._fitted = True

    def log_likelihoods(self, *,
                        outputs: Outputs,
                        inputs: Inputs,
                        timeout: float = None,
                        iterations: int = None) -> CallResult[Sequence[float]]:
        """
        Return, per sample and per target column, the log-probability the
        fitted estimator assigns to the class given in ``outputs``.

        An empty dataframe is returned when no input or target column is
        selected. ``timeout`` and ``iterations`` are accepted but not used.

        Raises
        ------
        PrimitiveNotFittedError
            If no column selection is available, i.e. the primitive was
            neither fitted nor restored through ``set_params``.
        InvalidArgumentValueError
            If ``outputs`` does not have the estimator's number of targets.
        """
        # Without column bookkeeping the selections below would fail with an
        # unrelated indexing error; report the real cause instead.
        if self._training_indices is None or self._target_column_indices is None:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        inputs = inputs.iloc[:, self._training_indices]
        outputs = outputs.iloc[:, self._target_column_indices]

        if len(inputs.columns) and len(outputs.columns):

            if outputs.shape[1] != self._clf.n_outputs_:
                raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

            log_proba = self._clf.predict_log_proba(inputs)

            # Making it always a list, even when only one target.
            if self._clf.n_outputs_ == 1:
                log_proba = [log_proba]
                classes = [self._clf.classes_]
            else:
                classes = self._clf.classes_

            samples_length = inputs.shape[0]

            log_likelihoods = []
            for k in range(self._clf.n_outputs_):
                # We have to map each class to its internal (numerical) index used in the learner.
                # This allows "outputs" to contain string classes.
                outputs_column = outputs.iloc[:, k]
                classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
                mapped_outputs_column = outputs_column.map(classes_map)

                # For each target column (column in "outputs"), for each sample (row) we pick the log
                # likelihood for a given class.
                log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

            results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
            results.columns = outputs.columns

            for k in range(self._clf.n_outputs_):
                column_metadata = outputs.metadata.query_column(k)
                if 'name' in column_metadata:
                    results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

        else:
            results = d3m_dataframe(generate_metadata=True)

        return CallResult(results)

    def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
        """
        Return the estimator's feature importances as a one-row dataframe
        whose columns are named after the training input columns.

        ``timeout`` and ``iterations`` are accepted but not used.

        Raises
        ------
        PrimitiveNotFittedError
            If the primitive has not been fitted.
        """
        # Same not-fitted signal as produce().
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
        output.columns = self._input_column_names
        for column_index, column_name in enumerate(self._input_column_names):
            output.metadata = output.metadata.update_column(column_index, {"name": column_name})
        return CallResult(output)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select the input columns the estimator should see.

        Returns a ``(dataframe, column_indices)`` pair. All columns are used
        when ``use_semantic_types`` is false; otherwise columns are filtered
        through ``_can_produce_column`` and the use/exclude hyper-parameters.
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, _ = base_utils.get_columns_to_use(inputs_metadata,
                                                              use_columns=hyperparams['use_inputs_columns'],
                                                              exclude_columns=hyperparams['exclude_inputs_columns'],
                                                              can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        """
        Tell whether a column is usable as a training input.

        A column qualifies when its structural type is numeric and its
        semantic types include ``Attribute``.
        """
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = {"https://metadata.datadrivendiscovery.org/types/Attribute"}
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if not semantic_types:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Every accepted semantic type must be present on the column.
        return accepted_semantic_types.issubset(semantic_types)

    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        """
        Select the target columns.

        Returns ``(targets, target_column_names, target_column_indices)``.
        All columns are targets when ``use_semantic_types`` is false;
        otherwise only columns carrying the ``TrueTarget`` semantic type
        (subject to the use/exclude hyper-parameters) are kept. ``targets``
        is an empty list when nothing is selected.
        """
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            accepted_semantic_types = {"https://metadata.datadrivendiscovery.org/types/TrueTarget"}
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if not semantic_types:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # Every accepted semantic type must be present on the column.
            return accepted_semantic_types.issubset(semantic_types)

        target_column_indices, _ = base_utils.get_columns_to_use(metadata,
                                                                 use_columns=hyperparams['use_outputs_columns'],
                                                                 exclude_columns=hyperparams['exclude_outputs_columns'],
                                                                 can_use_column=can_produce_column)
        targets = []
        if target_column_indices:
            targets = data.select_columns(target_column_indices)
        target_column_names = [data.columns[idx] for idx in target_column_indices]
        return targets, target_column_names, target_column_indices

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        """
        Derive per-column metadata for predictions from the training targets.

        ``TrueTarget`` / ``SuggestedTarget`` are dropped and
        ``PredictedTarget`` plus the ``return_semantic_type`` hyper-parameter
        value are added to each column's semantic types.
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        semantic_types_to_remove = {
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
        }
        semantic_types_to_add = {
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            hyperparams["return_semantic_type"],
        }

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types = (semantic_types - semantic_types_to_remove) | semantic_types_to_add
            # Sorted so the resulting metadata does not depend on set
            # iteration order.
            column_metadata['semantic_types'] = sorted(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        """
        Generate metadata for ``outputs`` and overlay the target column
        metadata on it, leaving out ``structural_type``.

        ``inputs_metadata`` is accepted but not used.
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            # Work on a copy: the entries belong to the caller (the
            # primitive's stored state) and must not lose "structural_type"
            # as a side effect of producing.
            column_update = OrderedDict(column_metadata)
            column_update.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_update)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        """Wrap raw predictions in a d3m dataframe carrying target metadata."""
        outputs = d3m_dataframe(predictions, generate_metadata=False)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
        """
        Build minimal metadata (name and ``PredictedTarget`` semantic type)
        for each output column; unnamed columns are called ``output_<index>``.
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict()
            column_metadata["semantic_types"] = ['https://metadata.datadrivendiscovery.org/types/PredictedTarget']
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
-
-
# Replace the primitive's own docstring with the wrapped estimator's one.
SKDecisionTreeClassifier.__doc__ = DecisionTreeClassifier.__doc__
|