Source code for foreshadow.estimators.auto

"""AutoEstimator."""

import warnings

import numpy as np

from foreshadow.base import BaseEstimator
from foreshadow.estimators.config import get_tpot_config
from foreshadow.serializers import ConcreteSerializerMixin
from foreshadow.utils import check_df, check_module_installed


[docs]class AutoEstimator(BaseEstimator, ConcreteSerializerMixin): """A wrapped estimator that selects the solution for a given problem. By default each automatic machine learning solution runs for 1 minute but that can be changed through passed kwargs. Autosklearn is not required for this to work but if installed it can be used alongside TPOT. Args: problem_type (str): The problem type, 'regression' or 'classification' auto (str): The automatic estimator, 'tpot' or 'autosklearn' include_preprocessors (bool): Whether include preprocessors in AutoML pipelines estimator_kwargs (dict): A dictionary of args to pass to the specified auto estimator (both problem_type and auto must be specified) """ def __init__( self, problem_type=None, auto=None, include_preprocessors=False, estimator_kwargs=None, ): self.problem_type = problem_type self.auto = auto self.include_preprocessors = include_preprocessors self.estimator_kwargs = estimator_kwargs self.estimator_class = None self.estimator = None @property def problem_type(self): """Type of machine learning problem. Either `regression` or `classification`. Returns: self._problem_type """ return self._problem_type @problem_type.setter def problem_type(self, pt): """Set the problem type. Args: pt: problem type Raises: ValueError: pt not valid problem type """ pt_options = ["classification", "regression"] if pt is not None and pt not in pt_options: raise ValueError("problem type must be in {}".format(pt_options)) self._problem_type = pt @property def auto(self): """Type of automl package. Either `tpot` or `autosklearn`. Returns: self._auto, the type of automl package """ return self._auto @auto.setter def auto(self, ae): """Set type of automl package. Args: ae: automl packaage Raises: ValueError: ae not a valid ae """ ae_options = ["tpot", "autosklearn"] if ae is not None and ae not in ae_options: raise ValueError("auto must be in {}".format(ae_options)) self._auto = ae @property def estimator_kwargs(self): """Get dictionary of kwargs to pass to AutoML package. Returns: estimator kwargs """ return self._estimator_kwargs @estimator_kwargs.setter def estimator_kwargs(self, ek): """Set estimator kwargs. Args: ek: kwargs Raises: ValueError: ek not a valid kwargs dict. """ if ek is not None and ek is not {}: if not isinstance(ek, dict) or not all( isinstance(k, str) for k in ek.keys() ): raise ValueError( "estimator_kwargs must be a valid kwarg dictionary" ) self._estimator_kwargs = ek else: self._estimator_kwargs = {} def _get_optimal_estimator_class(self): """Pick the optimal estimator class. Defaults to a working estimator if autosklearn is not installed. Returns: optimal estimator class. """ auto_ = self._pick_estimator() if self.auto is None else self.auto estimator_choices = { "autosklearn": { "classification": "AutoSklearnClassifier", "regression": "AutoSklearnRegressor", }, "tpot": { "classification": "TPOTClassifier", "regression": "TPOTRegressor", }, } if not check_module_installed(auto_): selected_auto = "tpot" warnings.warn( "{} is not available, defaulting to {}".format( auto_, selected_auto ) ) self.auto = selected_auto else: self.auto = auto_ if self.auto == "autosklearn": import autosklearn.classification import autosklearn.regression class_ = estimator_choices[self.auto][self.problem_type] if class_ == "AutoSklearnClassifier": return getattr(autosklearn.classification, class_) else: return getattr(autosklearn.regression, class_) else: import tpot return getattr( tpot, estimator_choices[self.auto][self.problem_type] ) def _pick_estimator(self): """Pick auto estimator based on benchmarked results. Returns: estimator """ return "tpot" if self.problem_type == "regression" else "autosklearn" def _pre_configure_estimator_kwargs(self): """Configure auto estimators to perform similarly (time scale). Also remove preprocessors if necessary. Returns: estimator kwargs """ if self.auto == "tpot" and "config_dict" not in self.estimator_kwargs: self.estimator_kwargs["config_dict"] = get_tpot_config( self.problem_type, self.include_preprocessors ) if "max_time_mins" not in self.estimator_kwargs: self.estimator_kwargs["max_time_mins"] = 5 if "verbosity" not in self.estimator_kwargs: self.estimator_kwargs["verbosity"] = 3 elif ( self.auto == "autosklearn" and not any( k in self.estimator_kwargs for k in ["include_preprocessors", "exclude_preprocessors"] ) and not self.include_preprocessors ): self.estimator_kwargs["include_preprocessors"] = [ "no_preprocessing" ] if ( self.auto == "autosklearn" and "time_left_for_this_task" not in self.estimator_kwargs ): self.estimator_kwargs["time_left_for_this_task"] = 360 return self.estimator_kwargs
[docs] def configure_estimator(self, y): """Construct and return the auto estimator instance. Args: y: input labels Returns: autoestimator instance """ self.problem_type = ( determine_problem_type(y) if self.problem_type is None else self.problem_type ) self.estimator_class = ( self._get_optimal_estimator_class() ) # update estimator class in case of autodetect self._pre_configure_estimator_kwargs() # validate estimator kwargs return self.estimator_class(**self.estimator_kwargs)
[docs] def fit(self, X, y): """Fit the AutoEstimator instance. Uses the selected AutoML estimator. Args: X (pandas.DataFrame or numpy.ndarray or list): The input feature(s) y (pandas.DataFrame or numpy.ndarray or list): The response feature(s) Returns: The selected estimator """ X = check_df(X) y = check_df(y) self.estimator = self.configure_estimator(y) self.estimator.fit(X, y) return self.estimator
[docs] def predict(self, X): """Use the trained estimator to predict the response. Args: X (pandas.DataFrame or numpy.ndarray or list): The input feature(s) Returns: pandas.DataFrame: The response feature(s) """ X = check_df(X) return self.estimator.predict(X)
[docs] def predict_proba(self, X): # pragma: no cover """Use the trained estimator to predict the responses probabilities. Args: X (pandas.DataFrame or numpy.ndarray or list): The input feature(s) Returns: pandas.DataFrame: The probability associated with each response \ feature """ X = check_df(X) return self.estimator.predict_proba(X)
[docs] def score(self, X, y, sample_weight=None): """Use the trained estimator to compute the evaluation score. Note: sample weights are not supported Args: X (pandas.DataFrame or numpy.ndarray or list): The input feature(s) y (pandas.DataFrame or numpy.ndarray or list): The response feature(s) sample_weight: sample weighting. Not implemented. Returns: float: A computed prediction fitness score """ X = check_df(X) y = check_df(y) return self.estimator.score(X, y)
def determine_problem_type(y): """Determine modeling problem type. Args: y: input labels Returns: problem type inferred from y. """ return ( "classification" if np.unique(y.values.ravel()).size == 2 else "regression" )