#!/usr/bin/env python
# coding: utf-8

# # OPTaaS Scikit-learn Custom Optimizable Estimators
#
# ### Note: To run this notebook, you need an API Key. You can get one here.
#
# Using the OPTaaS Python Client, you can optimize any scikit-learn pipeline. For each step or estimator in the pipeline, OPTaaS just needs to know what parameters to optimize and what constraints will apply to them.
#
# We have provided pre-defined parameters and constraints for some of the most widely used estimators, such as Random Forest and XGBoost (there is a [tutorial on how to use them](06.%20Scikit-learn%20Pipelines.ipynb)), but you can easily optimize any estimator, whether or not it's part of the sklearn library. Here's an example:

# ## Creating an Optimizable Estimator
#
# First we create a class that extends our `OptimizableBaseEstimator` mixin. You'll notice there is an abstract method that we will need to implement:

# In[ ]:

from mindfoundry.optaas.client.sklearn_pipelines.mixin import OptimizableBaseEstimator
from mindfoundry.optaas.client.sklearn_pipelines.parameter_maker import SklearnParameterMaker
from mindfoundry.optaas.client.sklearn_pipelines.utils import ParametersConstraintsAndPriorMeans


class MyEstimator(OptimizableBaseEstimator):
    def make_parameters_constraints_and_prior_means(self, sk: SklearnParameterMaker, **kwargs) -> ParametersConstraintsAndPriorMeans:
        pass

# ## Defining Parameters
#
# For each of our estimator's hyperparameters that we wish to optimize, we will create a corresponding OPTaaS parameter.
#
# The first argument to our method is a `SklearnParameterMaker`. We will use this to create our parameters, i.e. we call `sk.CategoricalParameter` instead of just `CategoricalParameter`.
#
# This ensures that each parameter is automatically assigned a unique id and a default value, which is based on the values set in the estimator's constructor. The parameter name should be exactly the same as the name of the argument in our constructor:

# In[ ]:

class MyEstimator(OptimizableBaseEstimator):
    def __init__(self, cat_param='abc'):
        self.cat_param = cat_param

    def make_parameters_constraints_and_prior_means(self, sk: SklearnParameterMaker, **kwargs) -> ParametersConstraintsAndPriorMeans:
        return [
            sk.CategoricalParameter("cat_param", values=['abc', 'def', 'ghi'])
        ], [], []


display(f"{MyEstimator()} (default = 'abc')")
display(f"{MyEstimator(cat_param='def')} (default = 'def')")

# Convenience methods and constants are provided to help us model all the different scenarios we might come across:

# In[ ]:

from mindfoundry.optaas.client.sklearn_pipelines.utils import SMALLEST_NUMBER_ABOVE_ZERO, LARGEST_NUMBER_BELOW_ONE


class MyEstimator(OptimizableBaseEstimator):
    def make_parameters_constraints_and_prior_means(self, sk: SklearnParameterMaker, **kwargs) -> ParametersConstraintsAndPriorMeans:
        return [
            # A float value in the range (0, 1) exclusive
            sk.FloatParameter("float_param", minimum=SMALLEST_NUMBER_ABOVE_ZERO, maximum=LARGEST_NUMBER_BELOW_ONE),

            # Either an integer or the string 'auto' (there is also FloatOrAuto for floats)
            sk.IntOrAuto("int_or_auto", minimum=0, maximum=10),

            # Either an integer or None
            sk.IntParameter("int_or_none", minimum=0, maximum=10, optional=True),

            # An integer or a float or a string
            sk.ChoiceParameter("multi_type_param", choices=[
                sk.IntParameter("an_int", minimum=5, maximum=10),
                sk.FloatParameter("a_float", minimum=0, maximum=1),
                sk.CategoricalParameter("a_string", values=['abc', 'xyz'])
            ]),

            # A list of values, e.g. [1, 0.2, 'c']
            sk.GroupParameter("list_of_stuff", items=[
                sk.IntParameter("an_int", minimum=0, maximum=5),
                sk.FloatParameter("a_float", minimum=0, maximum=0.5),
                sk.CategoricalParameter("a_string", values=['a', 'b', 'c']),
            ]),

            # A dict value, e.g. {'alpha': 0.5, 'beta': 13}
            sk.DictParameter("dict_param", items=[
                sk.FloatParameter('alpha', minimum=0.5, maximum=1),
                sk.IntParameter('beta', minimum=10, maximum=20)
            ])
        ], [], []
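# Remember that default values are derived from the estimator's constructor, so an estimator exposing the parameters above would declare matching constructor arguments. A minimal sketch (the class name and default values below are illustrative, not part of the library):

# In[ ]:

class MyConfiguredEstimator(MyEstimator):
    # Hypothetical constructor for illustration: each argument name matches
    # a parameter name above, so OPTaaS can derive the defaults from the
    # values set here ('auto', None, 0.5, etc.)
    def __init__(self, float_param=0.5, int_or_auto='auto', int_or_none=None,
                 multi_type_param=7, list_of_stuff=(1, 0.2, 'c'), dict_param=None):
        self.float_param = float_param
        self.int_or_auto = int_or_auto
        self.int_or_none = int_or_none
        self.multi_type_param = multi_type_param
        self.list_of_stuff = list_of_stuff
        self.dict_param = dict_param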
# ## Additional arguments
#
# Some estimators require additional information in order to optimize their hyperparameters, e.g. for PCA and ICA we need to know how many features are in our dataset, so that we can set a maximum value for the `n_components` parameter.
#
# These arguments are provided when a Task is created, and they are made available here as `kwargs`. We can use the `get_required_kwarg` method to raise an error if an argument has not been provided:

# In[ ]:

from sklearn.decomposition import PCA as BasePCA


class PCA(BasePCA, OptimizableBaseEstimator):
    def make_parameters_constraints_and_prior_means(self, sk: SklearnParameterMaker, **kwargs) -> ParametersConstraintsAndPriorMeans:
        feature_count = self.get_required_kwarg(kwargs, 'feature_count')
        # The 'arpack' solver requires n_components to be strictly less than the feature count
        max_n_components = feature_count - 1 if self.svd_solver == 'arpack' else feature_count
        return [
            sk.IntParameter('n_components', minimum=1, maximum=max_n_components),
            sk.BoolParameter('whiten')
        ], [], []

# ## Constraints
#
# Where necessary, we can also implement some constraints to prevent OPTaaS from generating a configuration which our constructor would not accept:

# In[ ]:

from mindfoundry.optaas.client.constraint import Constraint


class MyEstimator(OptimizableBaseEstimator):
    def __init__(self, cat_param='abc', bool_param=True):
        if cat_param == 'abc' and not bool_param:
            raise ValueError('Invalid combination of arguments')
        self.cat_param = cat_param
        self.bool_param = bool_param

    def make_parameters_constraints_and_prior_means(self, sk: SklearnParameterMaker, **kwargs) -> ParametersConstraintsAndPriorMeans:
        cat_param = sk.CategoricalParameter("cat_param", values=['abc', 'def', 'ghi'])
        bool_param = sk.BoolParameter("bool_param")
        # The `==` operators build a constraint expression: whenever cat_param is 'abc', bool_param must be True
        constraint = Constraint(when=cat_param == 'abc', then=bool_param == True)
        return [cat_param, bool_param], [constraint], []

    def fit(self, X, y=None):
        return self

# ## Creating a Task
#
# We now create a task using our new estimator. As you can see, all the parameters and constraints have been generated as expected, and the defaults have been set.

# In[ ]:

from mindfoundry.optaas.client.client import OPTaaSClient
from mindfoundry.optaas.client.sklearn_pipelines.mixin import OptimizablePipeline

client = OPTaaSClient('https://optaas.mindfoundry.ai', '')  # pass your API Key as the second argument

task = client.create_sklearn_task(
    title='My Custom Estimator Task',
    pipeline=OptimizablePipeline([
        ('pca', PCA(svd_solver='arpack')),
        ('my estimator', MyEstimator(cat_param='def'))
    ]),
    feature_count=20
)

display(task.parameters, task.constraints)

# ## Generating pipelines
#
# We can now generate some configurations for our task and use them to create pipelines:

# In[ ]:

for configuration in task.generate_configurations(5):
    display(task.make_pipeline(configuration))
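# In a full optimization run we would also score each pipeline and report the score back to OPTaaS. A hypothetical sketch of that loop: it assumes our estimator implements a real `fit`/`score`, uses synthetic data purely for illustration, and the `record_result` call follows the pattern used in the OPTaaS tutorials (check your client version):

# In[ ]:

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Illustrative data only: 20 features to match feature_count above
X, y = make_classification(n_samples=100, n_features=20)

for configuration in task.generate_configurations(5):
    pipeline = task.make_pipeline(configuration)
    # Requires a real fit/score implementation on the final estimator
    score = cross_val_score(pipeline, X, y, cv=3).mean()
    # Assumed API, as used in the OPTaaS tutorials
    task.record_result(configuration=configuration, score=score)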
# ## Optional Estimators
#
# Any estimator can be an optional step in a pipeline by simply calling `optional_step(estimator)` as demonstrated [here](06.%20Scikit-learn%20Pipelines.ipynb).
#
# However, if you want your estimator to **always** be optional, you can simply use the `OptionalStepMixin`:

# In[ ]:

from mindfoundry.optaas.client.sklearn_pipelines.mixin import OptionalStepMixin


class MyOptionalEstimator(MyEstimator, OptionalStepMixin):
    pass
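# As a quick check, we can build a task with the optional estimator. A minimal sketch reusing the client, `PCA` and `OptimizablePipeline` from above (the task title is illustrative):

# In[ ]:

optional_task = client.create_sklearn_task(
    title='My Optional Estimator Task',
    pipeline=OptimizablePipeline([
        ('pca', PCA(svd_solver='arpack')),
        ('my optional estimator', MyOptionalEstimator(cat_param='def'))
    ]),
    feature_count=20
)

# The generated parameters should now reflect that the estimator step can be skipped
display(optional_task.parameters)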