Using the OPTaaS Python Client, you can optimize any scikit-learn pipeline. For each step or estimator in the pipeline, OPTaaS just needs to know what parameters to optimize and what constraints will apply to them.
Your pipeline can even include optional steps (such as feature selection), choice steps (such as choosing between a set of classifiers) and nested pipelines.
We have provided pre-defined parameters and constraints for some of the most widely used estimators, such as Random Forest and XGBoost. The example below demonstrates how to use them. See also our tutorial on defining your own custom optimizable estimators.
import pandas as pd

# Load the German Credit dataset: 'Creditability' is the binary target,
# and every remaining column is used as a feature.
data = pd.read_csv('../../data/german_credit.csv')
target = data['Creditability']
features = data.drop(columns=['Creditability'])
Our pipeline will include:
An optional feature selection step using PCA
A choice of classifier from: Random Forest, Extra Trees and Gradient Boost
from mindfoundry.optaas.client.sklearn_pipelines.estimators.pca import PCA
from mindfoundry.optaas.client.sklearn_pipelines.estimators.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from mindfoundry.optaas.client.sklearn_pipelines.mixin import OptimizablePipeline, choice, optional_step

# Feature selection via PCA; wrapped in optional_step so the optimizer
# may also try configurations that skip this step entirely.
feature_selection_step = optional_step(PCA())

# Exactly one classifier is selected (and tuned) per configuration.
classifier_choice = choice(
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
)

optimizable_pipeline = OptimizablePipeline([
    ('feature_selection', feature_selection_step),
    ('classification', classifier_choice),
])
We now create a client, and connect to the web service that will perform our optimization. You will need to input your personal API key. Make sure you keep your key private and don't commit it to your version control system.
from mindfoundry.optaas.client.client import OPTaaSClient

# NOTE: keep the API key private — never commit it to version control.
OPTAAS_URL = 'https://optaas.mindfoundry.ai'
OPTAAS_API_KEY = '<Your OPTaaS API key>'

client = OPTaaSClient(OPTAAS_URL, OPTAAS_API_KEY)
We don't need to worry about specifying all the parameters and constraints - they are generated automatically based on our OptimizablePipeline. Sometimes we will need to provide additional kwargs, e.g. feature_count, which is required by PCA.
If we need to optimize any additional parameters that are outside of our pipeline, we can include them via additional_parameters and additional_constraints.
from mindfoundry.optaas.client.constraint import Constraint
from mindfoundry.optaas.client.parameter import IntParameter

# An extra integer parameter in [0, 10], optimized alongside the pipeline.
my_extra_param = IntParameter('extra', minimum=0, maximum=10, id='extra')
# Constrain the optimizer to never propose the value 7 for it.
my_extra_constraint = Constraint(my_extra_param != 7)
# Pipeline parameters and constraints are generated automatically from
# optimizable_pipeline; feature_count is an extra kwarg required by PCA.
task_settings = dict(
    title='My Sklearn Task',
    pipeline=optimizable_pipeline,
    feature_count=len(features.columns),
    additional_parameters=[my_extra_param],
    additional_constraints=[my_extra_constraint],
    min_known_score=0,  # optional: lowest known achievable score
    max_known_score=1,  # optional: highest known achievable score
)
task = client.create_sklearn_task(**task_settings)
# Inspect the auto-generated parameter tree and constraint list.
for generated in (task.parameters, task.constraints):
    display(generated)
[{'id': 'pipeline', 'name': 'pipeline', 'type': 'group', 'items': [{'id': 'pipeline__feature_selection', 'name': 'feature_selection', 'type': 'group', 'optional': True, 'items': [{'id': 'pipeline__feature_selection__n_components', 'name': 'n_components', 'type': 'integer', 'minimum': 1, 'maximum': 20}, {'id': 'pipeline__feature_selection__whiten', 'name': 'whiten', 'type': 'boolean', 'default': False}]}, {'id': 'classification', 'name': 'classification', 'type': 'choice', 'choices': [{'id': 'pipeline__classification__0', 'name': '0', 'type': 'group', 'items': [{'id': 'pipeline__classification__0__max_features', 'name': 'max_features', 'type': 'categorical', 'default': 'auto', 'enum': ['auto', 'sqrt', 'log2']}, {'id': 'pipeline__classification__0__min_samples_split', 'name': 'min_samples_split', 'type': 'integer', 'default': 2, 'minimum': 2, 'maximum': 20, 'distribution': 'Uniform'}, {'id': 'pipeline__classification__0__min_samples_leaf', 'name': 'min_samples_leaf', 'type': 'integer', 'default': 1, 'minimum': 1, 'maximum': 20}, {'id': 'pipeline__classification__0__criterion', 'name': 'criterion', 'type': 'categorical', 'default': 'gini', 'enum': ['gini', 'entropy']}, {'id': 'pipeline__classification__0__max_leaf_nodes', 'name': 'max_leaf_nodes', 'type': 'integer', 'optional': True, 'includeInDefault': False, 'minimum': 10, 'maximum': 10000, 'distribution': 'LogUniform'}, {'id': 'pipeline__classification__0__max_depth', 'name': 'max_depth', 'type': 'integer', 'optional': True, 'includeInDefault': False, 'minimum': 1, 'maximum': 100, 'distribution': 'LogUniform'}, {'id': 'pipeline__classification__0__min_weight_fraction_leaf', 'name': 'min_weight_fraction_leaf', 'type': 'number', 'default': 0.0, 'minimum': 0.0, 'maximum': 0.5}, {'id': 'pipeline__classification__0__min_impurity_decrease', 'name': 'min_impurity_decrease', 'type': 'number', 'default': 0.0, 'minimum': 0, 'maximum': 1}, {'id': 'pipeline__classification__0__bootstrap', 'name': 'bootstrap', 'type': 
'boolean', 'default': True}, {'id': 'pipeline__classification__0__n_estimators', 'name': 'n_estimators', 'type': 'integer', 'default': 10, 'minimum': 10, 'maximum': 500}]}, {'id': 'pipeline__classification__1', 'name': '1', 'type': 'group', 'items': [{'id': 'pipeline__classification__1__max_features', 'name': 'max_features', 'type': 'categorical', 'default': 'auto', 'enum': ['auto', 'sqrt', 'log2']}, {'id': 'pipeline__classification__1__min_samples_split', 'name': 'min_samples_split', 'type': 'integer', 'default': 2, 'minimum': 2, 'maximum': 20, 'distribution': 'LogUniform'}, {'id': 'pipeline__classification__1__min_samples_leaf', 'name': 'min_samples_leaf', 'type': 'integer', 'default': 1, 'minimum': 1, 'maximum': 20}, {'id': 'pipeline__classification__1__criterion', 'name': 'criterion', 'type': 'categorical', 'default': 'gini', 'enum': ['gini', 'entropy']}, {'id': 'pipeline__classification__1__max_leaf_nodes', 'name': 'max_leaf_nodes', 'type': 'integer', 'optional': True, 'includeInDefault': False, 'minimum': 10, 'maximum': 10000, 'distribution': 'LogUniform'}, {'id': 'pipeline__classification__1__max_depth', 'name': 'max_depth', 'type': 'integer', 'optional': True, 'includeInDefault': False, 'minimum': 1, 'maximum': 100, 'distribution': 'Uniform'}, {'id': 'pipeline__classification__1__min_weight_fraction_leaf', 'name': 'min_weight_fraction_leaf', 'type': 'number', 'default': 0.0, 'minimum': 0.0, 'maximum': 0.5}, {'id': 'pipeline__classification__1__min_impurity_decrease', 'name': 'min_impurity_decrease', 'type': 'number', 'default': 0.0, 'minimum': 0, 'maximum': 1}, {'id': 'pipeline__classification__1__bootstrap', 'name': 'bootstrap', 'type': 'boolean', 'default': False}, {'id': 'pipeline__classification__1__n_estimators', 'name': 'n_estimators', 'type': 'integer', 'default': 10, 'minimum': 10, 'maximum': 500}]}, {'id': 'pipeline__classification__2', 'name': '2', 'type': 'group', 'items': [{'id': 'pipeline__classification__2__max_features', 'name': 
'max_features', 'type': 'categorical', 'enum': ['auto', 'sqrt', 'log2']}, {'id': 'pipeline__classification__2__min_samples_split', 'name': 'min_samples_split', 'type': 'integer', 'default': 2, 'minimum': 2, 'maximum': 20, 'distribution': 'Uniform'}, {'id': 'pipeline__classification__2__min_samples_leaf', 'name': 'min_samples_leaf', 'type': 'integer', 'default': 1, 'minimum': 1, 'maximum': 20}, {'id': 'pipeline__classification__2__criterion', 'name': 'criterion', 'type': 'categorical', 'default': 'friedman_mse', 'enum': ['mse', 'friedman_mse', 'mae']}, {'id': 'pipeline__classification__2__max_leaf_nodes', 'name': 'max_leaf_nodes', 'type': 'integer', 'optional': True, 'includeInDefault': False, 'minimum': 10, 'maximum': 10000, 'distribution': 'LogUniform'}, {'id': 'pipeline__classification__2__max_depth', 'name': 'max_depth', 'type': 'integer', 'optional': True, 'default': 3, 'minimum': 1, 'maximum': 100, 'distribution': 'Uniform'}, {'id': 'pipeline__classification__2__min_weight_fraction_leaf', 'name': 'min_weight_fraction_leaf', 'type': 'number', 'default': 0.0, 'minimum': 0.0, 'maximum': 0.5}, {'id': 'pipeline__classification__2__min_impurity_decrease', 'name': 'min_impurity_decrease', 'type': 'number', 'default': 0.0, 'minimum': 0, 'maximum': 1}, {'id': 'pipeline__classification__2__learning_rate', 'name': 'learning_rate', 'type': 'number', 'default': 0.1, 'minimum': 5e-324, 'maximum': 1}, {'id': 'pipeline__classification__2__n_estimators', 'name': 'n_estimators', 'type': 'integer', 'default': 100, 'minimum': 10, 'maximum': 500}, {'id': 'pipeline__classification__2__subsample', 'name': 'subsample', 'type': 'number', 'default': 1.0, 'minimum': 0, 'maximum': 1}]}]}]}, {'id': '2544445067952', 'name': 'additional', 'type': 'group', 'items': [{'id': 'extra', 'name': 'extra', 'type': 'integer', 'minimum': 0, 'maximum': 10}]}]
['#extra != 7']
We define a function to run our pipeline and calculate the mean score and variance:
from sklearn.model_selection import cross_val_score

def scoring_function(pipeline):
    """Return (mean, variance) of 5-fold f1_micro cross-validation scores.

    Uses the module-level `features` and `target` loaded earlier.
    """
    fold_scores = cross_val_score(pipeline, features, target,
                                  scoring='f1_micro', cv=5)
    return fold_scores.mean(), fold_scores.var()
We run the task for 20 iterations and review the results:
# Run the optimization for a fixed budget of iterations
# (the task may also stop early if the max known score is reached).
iteration_budget = 20
best_result = task.run(scoring_function, iteration_budget)
print("Best Result: ", best_result)
Running task "My Sklearn Task" for 20 iterations (or until score is 1.0 or better) Iteration: 0 Score: 0.7139774505043966 Variance: 0.0011199704714439302 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_lea...n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 1 Score: 0.7429884974795155 Variance: 0.0013301807783011328 Pipeline: Pipeline(memory=None, steps=[('classification', GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, presort='auto', random_state=None, subsample=1.0, verbose=0, warm_start=False))]) Iteration: 2 Score: 0.6969754185323048 Variance: 0.000789097736972498 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=50, max_features='auto', max_leaf_...n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 3 Score: 0.7449755144365922 Variance: 0.0014781269404066037 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_...timators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, 
warm_start=False))]) Iteration: 4 Score: 0.7159884435333538 Variance: 0.0015111632909465863 Pipeline: Pipeline(memory=None, steps=[('classification', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=50, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 5 Score: 0.7209964455473438 Variance: 0.0005489016935114717 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=50, max_features='auto', max_leaf_no...timators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 6 Score: 0.6999694305083527 Variance: 0.0009542203697347021 Pipeline: Pipeline(memory=None, steps=[('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 7 Score: 0.7089844335353318 Variance: 0.0004593334249857999 Pipeline: Pipeline(memory=None, steps=[('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=50, max_features='auto', max_leaf_nodes=5005, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 8 Score: 0.7079744415073757 Variance: 
0.0006647942864664427 Pipeline: Pipeline(memory=None, steps=[('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=5005, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 9 Score: 0.7279794764824704 Variance: 0.0017496840000436487 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_...timators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 10 Score: 0.7289804774834715 Variance: 0.00163495165607608 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=12, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_...timators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 11 Score: 0.7029784275293257 Variance: 0.0022902989142203427 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_lea...n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 12 Score: 0.7209874545203886 Variance: 0.0010947255754004488 Pipeline: Pipeline(memory=None, 
steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=8, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_n...timators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 13 Score: 0.7159644674614735 Variance: 0.0017269709319498695 Pipeline: Pipeline(memory=None, steps=[('classification', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=50, max_features='auto', max_leaf_nodes=5005, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 14 Score: 0.699999400598203 Variance: 1.7964125712735343e-07 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=11, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy', max_depth=None, max_features='auto', max_le..., n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 15 Score: 0.6849873825921731 Variance: 0.0004643686436742 Pipeline: Pipeline(memory=None, steps=[('classification', GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samp... 
presort='auto', random_state=None, subsample=1.0, verbose=0, warm_start=False))]) Iteration: 16 Score: 0.7339884794974615 Variance: 0.001510748319642625 Pipeline: Pipeline(memory=None, steps=[('classification', GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, presort='auto', random_state=None, subsample=1.0, verbose=0, warm_start=False))]) Iteration: 17 Score: 0.723010435585286 Variance: 0.0013184775096818067 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=50, max_features='auto', max_leaf_no...timators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]) Iteration: 18 Score: 0.6789903676131223 Variance: 0.0005889760651514416 Pipeline: Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=11, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.9301432147001057, loss='deviance', ...uto', random_state=None, subsample=0.26687567221016584, verbose=0, warm_start=False))]) Iteration: 19 Score: 0.6979794165422907 Variance: 0.0004282720144984198 Pipeline: Pipeline(memory=None, steps=[('classification', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=5005, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, 
verbose=0, warm_start=False))]) Task Completed Best Result: { 'pipeline': Pipeline(memory=None, steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classification', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_...timators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False))]), 'score': 0.7449755144365922, 'user_defined_data': None}